
     `i|                    0   d Z ddlZddlmZ ddlmZmZ ddlZddlmZ ddl	m
Z
mZmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ  ej        e           Z!e ed           G d de                                  Z"e ed           G d de                                  Z#e ed           G d de                                  Z$e ed           G d de                                  Z%e ed           G d de                                  Z&e ed           G d  d!e                                  Z'e ed"           G d# d$e                                  Z(e ed%           G d& d'e                                  Z)e ed(           G d) d*e                                  Z*e ed+           G d, d-e                                  Z+ G d. d/ej,                  Z- G d0 d1ej,                  Z. G d2 d3ej,                  Z/ G d4 d5ej,                  Z0 G d6 d7ej,                  Z1 G d8 d9ej,                  Z2 G d: d;ej,                  Z3 G d< d=e          Z4 G d> d?ej,                  Z5 G d@ dAej,                  Z6 G dB dCej,                  Z7 G dD dEej,                  Z8e G dF dGe                      Z9 edH           G dI dJe9                      Z:dK Z; G dL dMej,                  Z< edN           G dO dPe9                      Z= edQ           G dR dSe9                      Z> edT           G dU dVe9                      Z? edW           G dX dYe9                      Z@ edZ           G d[ d\e9                      ZA ed]           G d^ d_e9                      ZBe G d` dae9                      ZCe G db dce9                      ZDg ddZEdS )ezPyTorch LUKE model.    N)	dataclass)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FNgelu)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel)apply_chunking_to_forward)ModelOutputauto_docstringlogging   )
LukeConfigz3
    Base class for outputs of the LUKE model.
    )custom_introc                   l    e Zd ZU dZdZeej                 ed<   dZ	ee
ej        df                  ed<   dS )BaseLukeModelOutputWithPoolingax  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Last layer hidden-state of the first token of the sequence (classification token) further processed by a
        Linear layer and a Tanh activation function.
    entity_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, entity_length, hidden_size)`):
        Sequence of entity hidden-states at the output of the last layer of the model.
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    Nentity_last_hidden_state.entity_hidden_states__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   tuple     z/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/luke/modeling_luke.pyr   r   %   s[         
 
 =Ahu'89@@@DH(5):C)?#@AHHHHHr&   r   zV
    Base class for model's outputs, with potential hidden states and attentions.
    c                   l    e Zd ZU dZdZeej                 ed<   dZ	ee
ej        df                  ed<   dS )BaseLukeModelOutputa  
    entity_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, entity_length, hidden_size)`):
        Sequence of entity hidden-states at the output of the last layer of the model.
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    Nr   .r   r   r%   r&   r'   r)   r)   <   s[           =Ahu'89@@@DH(5):C)?#@AHHHHHr&   r)   c                   `   e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eej                 ed<   dZeej                 ed<   dZeej                 ed<   dZeeej                          ed<   dZeeej        d	f                  ed
<   dZeeej        d	f                  ed<   dS )LukeMaskedLMOutputa:  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        The sum of masked language modeling (MLM) loss and entity prediction loss.
    mlm_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Masked language modeling (MLM) loss.
    mep_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Masked entity prediction (MEP) loss.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    entity_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the entity prediction head (scores for each entity vocabulary token before SoftMax).
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    Nlossmlm_lossmep_losslogitsentity_logitshidden_states.r   
attentions)r   r   r   r    r,   r   r!   r"   r#   r-   r.   r/   r0   r1   r$   r   r2   r%   r&   r'   r+   r+   P   s          " )-D(5$
%,,,,0Hhu()000,0Hhu()000*.FHU&'...15M8E-.5558<M8E%"345<<<DH(5):C)?#@AHHH:>Ju0#567>>>>>r&   r+   z2
    Outputs of entity classification models.
    c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eeej        df                  ed<   dZeeej        df                  ed<   dZeeej        df                  ed<   dS )	EntityClassificationOutput  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification scores (before SoftMax).
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    Nr,   r/   .r1   r   r2   r   r   r   r    r,   r   r!   r"   r#   r/   r1   r$   r   r2   r%   r&   r'   r4   r4   r            	 	 )-D(5$
%,,,*.FHU&'...=AM8E%"3S"89:AAADH(5):C)?#@AHHH:>Ju0#567>>>>>r&   r4   z7
    Outputs of entity pair classification models.
    c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eeej        df                  ed<   dZeeej        df                  ed<   dZeeej        df                  ed<   dS )	EntityPairClassificationOutputr5   Nr,   r/   .r1   r   r2   r6   r%   r&   r'   r9   r9      r7   r&   r9   z7
    Outputs of entity span classification models.
    c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eeej        df                  ed<   dZeeej        df                  ed<   dZeeej        df                  ed<   dS )	EntitySpanClassificationOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification loss.
    logits (`torch.FloatTensor` of shape `(batch_size, entity_length, config.num_labels)`):
        Classification scores (before SoftMax).
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    Nr,   r/   .r1   r   r2   r6   r%   r&   r'   r;   r;      r7   r&   r;   z4
    Outputs of sentence classification models.
    c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eeej        df                  ed<   dZeeej        df                  ed<   dZeeej        df                  ed<   dS )	LukeSequenceClassifierOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification (or regression if config.num_labels==1) loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification (or regression if config.num_labels==1) scores (before SoftMax).
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    Nr,   r/   .r1   r   r2   r6   r%   r&   r'   r=   r=      r7   r&   r=   z@
    Base class for outputs of token classification models.
    c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eeej        df                  ed<   dZeeej        df                  ed<   dZeeej        df                  ed<   dS )	LukeTokenClassifierOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification loss.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
        Classification scores (before SoftMax).
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    Nr,   r/   .r1   r   r2   r6   r%   r&   r'   r?   r?      r7   r&   r?   z/
    Outputs of question answering models.
    c                      e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eej                 ed<   dZeeej        df                  ed<   dZeeej        df                  ed<   dZeeej        df                  ed	<   dS )
 LukeQuestionAnsweringModelOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    Nr,   start_logits
end_logits.r1   r   r2   )r   r   r   r    r,   r   r!   r"   r#   rB   rC   r1   r$   r   r2   r%   r&   r'   rA   rA      s           )-D(5$
%,,,04L(5,-444.2J*+222=AM8E%"3S"89:AAADH(5):C)?#@AHHH:>Ju0#567>>>>>r&   rA   z,
    Outputs of multiple choice models.
    c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eeej        df                  ed<   dZeeej        df                  ed<   dZeeej        df                  ed<   dS )	LukeMultipleChoiceModelOutputa  
    loss (`torch.FloatTensor` of shape *(1,)*, *optional*, returned when `labels` is provided):
        Classification loss.
    logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
        *num_choices* is the second dimension of the input tensors. (see *input_ids* above).

        Classification scores (before SoftMax).
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    Nr,   r/   .r1   r   r2   r6   r%   r&   r'   rE   rE     s           )-D(5$
%,,,*.FHU&'...=AM8E%"3S"89:AAADH(5):C)?#@AHHH:>Ju0#567>>>>>r&   rE   c                   8     e Zd ZdZ fdZ	 	 	 	 ddZd Z xZS )LukeEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    c                 "   t                                                       t          j        |j        |j        |j                  | _        t          j        |j        |j                  | _	        t          j        |j
        |j                  | _        t          j        |j        |j                  | _        t          j        |j                  | _        |j        | _        t          j        |j        |j        | j                  | _	        d S )Npadding_idxeps)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutrJ   selfconfig	__class__s     r'   rN   zLukeEmbeddings.__init__'  s    !|F,=v?Q_e_rsss#%<0NPVPb#c#c %'\&2H&J\%]%]" f&8f>STTTz&"<== ".#%<*F,>DL\$
 $
 $
   r&   Nc                 :   |E|.t          || j                                      |j                  }n|                     |          }||                                }n|                                d d         }|+t          j        |t          j        | j	        j                  }|| 
                    |          }|                     |          }|                     |          }||z   |z   }|                     |          }|                     |          }|S )Ndtypedevice)"create_position_ids_from_input_idsrJ   tore   &create_position_ids_from_inputs_embedssizer!   zeroslongposition_idsrS   rU   rW   rX   r\   )	r^   	input_idstoken_type_idsrl   inputs_embedsinput_shaperU   rW   
embeddingss	            r'   forwardzLukeEmbeddings.forward8  s    $A)TM]^^aabkbrss#JJ=YY #..**KK',,..ss3K!"[EJtO`OghhhN  00;;M"66|DD $ : :> J J"%88;PP
^^J//
\\*--
r&   c                    |                                 dd         }|d         }t          j        | j        dz   || j        z   dz   t          j        |j                  }|                    d                              |          S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nrb   r   rc   r   )ri   r!   arangerJ   rk   re   	unsqueezeexpand)r^   ro   rp   sequence_lengthrl   s        r'   rh   z5LukeEmbeddings.create_position_ids_from_inputs_embedsY  s     $((**3B3/%a.|q /D4D"Dq"HPUPZcpcw
 
 
 %%a((//<<<r&   )NNNN)r   r   r   r    rN   rr   rh   __classcell__r`   s   @r'   rG   rG   "  st         
 
 
 
 
&    B= = = = = = =r&   rG   c                   f     e Zd Zdef fdZ	 ddej        dej        deej                 fdZ xZ	S )	LukeEntityEmbeddingsr_   c                 $   t                                                       || _        t          j        |j        |j        d          | _        |j        |j        k    r&t          j	        |j        |j        d          | _
        t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j                  | _        d S )Nr   rI   FbiasrK   )rM   rN   r_   r   rO   entity_vocab_sizeentity_emb_sizeentity_embeddingsrQ   Linearentity_embedding_denserT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   s     r'   rN   zLukeEntityEmbeddings.__init__l  s    !#f.FH^lm!n!n!n!V%777*,)F4JFL^ej*k*k*kD'#%<0NPVPb#c#c %'\&2H&J\%]%]"f&8f>STTTz&"<==r&   N
entity_idsrl   rn   c                    |t          j        |          }|                     |          }| j        j        | j        j        k    r|                     |          }|                     |                    d                    }|dk    	                    |          
                    d          }||z  }t          j        |d          }||                    d                              d          z  }|                     |          }||z   |z   }|                     |          }|                     |          }|S )Nr   )minrb   dimgHz>)r!   
zeros_liker   r_   r   rQ   r   rU   clamptype_asru   sumrW   rX   r\   )	r^   r   rl   rn   r   rU   position_embedding_maskrW   rq   s	            r'   rr   zLukeEntityEmbeddings.forwardz  sE    !"-j99N 22:>>;&$+*AAA $ ; ;<M N N"66|7I7Ia7I7P7PQQ#/2#5">">?R"S"S"]"]^`"a"a14KK#i(;DDD14K4O4OTV4O4W4W4]4]bf4]4g4gg $ : :> J J&)<<?TT
^^J//
\\*--
r&   N)
r   r   r   r   rN   r!   
LongTensorr   rr   rx   ry   s   @r'   r{   r{   k  s        >z > > > > > >$ 6:	 $ & !!12	       r&   r{   c                   2     e Zd Z fdZd Z	 	 	 ddZ xZS )LukeSelfAttentionc                 b   t                                                       |j        |j        z  dk    r0t	          |d          s t          d|j         d|j         d          |j        | _        t          |j        |j        z            | _        | j        | j        z  | _        |j	        | _	        t          j        |j        | j                  | _        t          j        |j        | j                  | _        t          j        |j        | j                  | _        | j	        rlt          j        |j        | j                  | _        t          j        |j        | j                  | _        t          j        |j        | j                  | _        t          j        |j                  | _        d S )Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .)rM   rN   rQ   num_attention_headshasattr
ValueErrorintattention_head_sizeall_head_sizeuse_entity_aware_attentionr   r   querykeyvalue	w2e_query	e2w_query	e2e_queryrZ   attention_probs_dropout_probr\   r]   s     r'   rN   zLukeSelfAttention.__init__  s{    ::a??PVXhHiHi?76#5 7 737 7 7  
 $*#= #&v'9F<V'V#W#W !58PP*0*K'Yv143EFF
9V/1CDDYv143EFF
* 	OYv'94;MNNDNYv'94;MNNDNYv'94;MNNDNz&"EFFr&   c                     |                                 d d         | j        | j        fz   } |j        | }|                    dddd          S )Nrb   r      r   r
   )ri   r   r   viewpermute)r^   xnew_x_shapes      r'   transpose_for_scoresz&LukeSelfAttention.transpose_for_scores  sM    ffhhssmt'?AY&ZZAFK yyAq!$$$r&   NFc                 ~   |                     d          }||}nt          j        ||gd          }|                     |                     |                    }|                     |                     |                    }	| j        r||                     |                     |                    }
|                     |                     |                    }|                     | 	                    |                    }|                     | 
                    |                    }|d d d d d |d d f         }|d d d d d |d d f         }|d d d d |d d d f         }|d d d d |d d d f         }t          j        |
|                    dd                    }t          j        ||                    dd                    }t          j        ||                    dd                    }t          j        ||                    dd                    }t          j        ||gd          }t          j        ||gd          }t          j        ||gd          }nQ|                     |                     |                    }t          j        ||                    dd                    }|t          j        | j                  z  }|||z   }t           j                            |d          }|                     |          }|||z  }t          j        ||	          }|                    dddd                                          }|                                 d d         | j        fz   } |j        | }|d d d |d d f         }|d }n|d d |d d d f         }|r|||f}n||f}|S )Nr   r   rb   r   r
   r   r   )ri   r!   catr   r   r   r   r   r   r   r   matmul	transposemathsqrtr   r   
functionalsoftmaxr\   r   
contiguousr   r   ) r^   word_hidden_statesr   attention_mask	head_maskoutput_attentions	word_sizeconcat_hidden_states	key_layervalue_layerw2w_query_layerw2e_query_layere2w_query_layere2e_query_layerw2w_key_layere2w_key_layerw2e_key_layere2e_key_layerw2w_attention_scoresw2e_attention_scorese2w_attention_scorese2e_attention_scoresword_attention_scoresentity_attention_scoresattention_scoresquery_layerattention_probscontext_layernew_context_layer_shapeoutput_word_hidden_statesoutput_entity_hidden_statesoutputss                                    r'   rr   zLukeSelfAttention.forward  s"    '++A..	'#5  #(9.@BV-W]^#_#_#_ --dhh7K.L.LMM	//

;O0P0PQQ* 	V/C/O #77

CU8V8VWWO"77GY8Z8Z[[O"77G[8\8\]]O"77G[8\8\]]O &aaaJYJ&9:M%aaaJYJ&9:M%aaaIJJ&9:M%aaaIJJ&9:M $)<AXAXY[]_A`A`#a#a #(<AXAXY[]_A`A`#a#a #(<AXAXY[]_A`A`#a#a #(<AXAXY[]_A`A`#a#a  %*I/CEY.Z`a$b$b$b!&+i1EG[0\bc&d&d&d#$y*?AX)Y_`aaa 33DJJ?S4T4TUUK$|K9L9LRQS9T9TUU+di8P.Q.QQ%/.@ -//0@b/II ,,77  -	9O_kBB%--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S**,CD$1!!!ZiZ2B$C!'*.''*79::qqq8H*I' 	O02M_GG02MNGr&   NNF)r   r   r   rN   r   rr   rx   ry   s   @r'   r   r     ss        G G G G G0% % % P P P P P P P Pr&   r   c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )LukeSelfOutputc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j	                  | _
        d S NrK   )rM   rN   r   r   rQ   denserX   rY   rZ   r[   r\   r]   s     r'   rN   zLukeSelfOutput.__init__	  sf    Yv163EFF
f&8f>STTTz&"<==r&   r1   input_tensorreturnc                     |                      |          }|                     |          }|                     ||z             }|S r   r   r\   rX   r^   r1   r   s      r'   rr   zLukeSelfOutput.forward  @    

=11]33}|'CDDr&   r   r   r   rN   r!   Tensorrr   rx   ry   s   @r'   r   r     i        > > > > >U\  RWR^        r&   r   c                   2     e Zd Z fdZd Z	 	 	 ddZ xZS )LukeAttentionc                     t                                                       t          |          | _        t	          |          | _        t                      | _        d S r   )rM   rN   r   r^   r   outputsetpruned_headsr]   s     r'   rN   zLukeAttention.__init__  sI    %f--	$V,,EEr&   c                      t          d          Nz4LUKE does not support the pruning of attention headsNotImplementedError)r^   headss     r'   prune_headszLukeAttention.prune_heads      !"XYYYr&   NFc                    |                     d          }|                     |||||          }||d         }|}	n6t          j        |d d         d          }t          j        ||gd          }	|                     ||	          }
|
d d d |d d f         }|d }n|
d d |d d d f         }||f|dd          z   }|S )Nr   r   r   r   )ri   r^   r!   r   r   )r^   r   r   r   r   r   r   self_outputsconcat_self_outputsr   attention_outputword_attention_outputentity_attention_outputr   s                 r'   rr   zLukeAttention.forward   s    '++A..	yy 
 
  '".q/#5  "')L!,<!"D"D"D#(9.@BV-W]^#_#_#_ ;;':<PQQ 0JYJ1A B'&*##&6qqq)**aaa7G&H# )*AB\RSRTRTEUUr&   r   )r   r   r   rN   r   rr   rx   ry   s   @r'   r   r     si        " " " " "Z Z Z " " " " " " " "r&   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )LukeIntermediatec                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S r   )rM   rN   r   r   rQ   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnr]   s     r'   rN   zLukeIntermediate.__init__G  sn    Yv163KLL
f'-- 	9'-f.?'@D$$$'-'8D$$$r&   r1   r   c                 Z    |                      |          }|                     |          }|S r   )r   r   r^   r1   s     r'   rr   zLukeIntermediate.forwardO  s,    

=1100??r&   r   ry   s   @r'   r   r   F  s^        9 9 9 9 9U\ el        r&   r   c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )
LukeOutputc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j	        |j
                  | _        d S r   )rM   rN   r   r   r   rQ   r   rX   rY   rZ   r[   r\   r]   s     r'   rN   zLukeOutput.__init__W  sf    Yv79KLL
f&8f>STTTz&"<==r&   r1   r   r   c                     |                      |          }|                     |          }|                     ||z             }|S r   r   r   s      r'   rr   zLukeOutput.forward]  r   r&   r   ry   s   @r'   r   r   V  r   r&   r   c                   2     e Zd Z fdZ	 	 	 ddZd Z xZS )	LukeLayerc                     t                                                       |j        | _        d| _        t	          |          | _        t          |          | _        t          |          | _	        d S Nr   )
rM   rN   chunk_size_feed_forwardseq_len_dimr   	attentionr   intermediater   r   r]   s     r'   rN   zLukeLayer.__init__e  s^    '-'E$&v..,V44 ((r&   NFc                 h   |                     d          }|                     |||||          }|	|d         }nt          j        |d d         d          }|dd          }	t	          | j        | j        | j        |          }
|
d d d |d d f         }|d }n|
d d |d d d f         }||f|	z   }	|	S )Nr   )r   r   r   r   )ri   r  r!   r   r   feed_forward_chunkr  r  )r^   r   r   r   r   r   r   self_attention_outputsconcat_attention_outputr   layer_outputword_layer_outputentity_layer_outputs                r'   rr   zLukeLayer.forwardm  s    '++A..	!% / "0 "
 "
  '&<Q&?##&+i0Frr0JPQ&R&R&R#(,0#T%A4CSUl
 
 )JYJ)9:'"&".qqq)**aaa/?"@$&9:WDr&   c                 \    |                      |          }|                     ||          }|S r   )r	  r   )r^   r   intermediate_outputr  s       r'   r  zLukeLayer.feed_forward_chunk  s2    "//0@AA{{#68HIIr&   r   )r   r   r   rN   rr   r  rx   ry   s   @r'   r  r  d  sg        ) ) ) ) ) # # # #J      r&   r  c                   0     e Zd Z fdZ	 	 	 	 	 ddZ xZS )LukeEncoderc                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S r%   )r  ).0_r_   s     r'   
<listcomp>z(LukeEncoder.__init__.<locals>.<listcomp>  s!    #_#_#_!If$5$5#_#_#_r&   F)	rM   rN   r_   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr]   s    `r'   rN   zLukeEncoder.__init__  s`    ]#_#_#_#_uVE]?^?^#_#_#_``
&+###r&   NFTc                 p   |rdnd }|rdnd }	|rdnd }
t          | j                  D ]N\  }}|r||fz   }|	|fz   }	|||         nd } ||||||          }|d         }||d         }|r|
|d         fz   }
O|r||fz   }|	|fz   }	|st          d |||
||	fD                       S t          |||
||	          S )Nr%   r   r   r   c              3      K   | ]}||V  	d S r   r%   r  vs     r'   	<genexpr>z&LukeEncoder.forward.<locals>.<genexpr>  4       
 
 =  !===
 
r&   )last_hidden_stater1   r2   r   r   )	enumerater  r$   r)   )r^   r   r   r   r   r   output_hidden_statesreturn_dictall_word_hidden_statesall_entity_hidden_statesall_self_attentionsilayer_modulelayer_head_masklayer_outputss                  r'   rr   zLukeEncoder.forward  s|    (<!E)=#G224 $5?bb4(44 	P 	POA|# ^)?CUBW)W&+CG[F]+](.7.CillO(L"$! M "/q!1#/'4Q'7$  P&9]1=M<O&O# 	Z%;?Q>S%S"'?CWBY'Y$ 	 
 
 '*'(,
 
 
 
 
 
 #00*%9!9
 
 
 	
r&   )NNFFTr   r   r   rN   rr   rx   ry   s   @r'   r  r    s]        , , , , , ":
 :
 :
 :
 :
 :
 :
 :
r&   r  c                   B     e Zd Z fdZdej        dej        fdZ xZS )
LukePoolerc                     t                                                       t          j        |j        |j                  | _        t          j                    | _        d S r   )rM   rN   r   r   rQ   r   Tanh
activationr]   s     r'   rN   zLukePooler.__init__  sC    Yv163EFF
'))r&   r1   r   c                 r    |d d df         }|                      |          }|                     |          }|S )Nr   )r   r5  )r^   r1   first_token_tensorpooled_outputs       r'   rr   zLukePooler.forward  s@     +111a40

#56666r&   r   ry   s   @r'   r2  r2    s^        $ $ $ $ $
U\ el        r&   r2  c                   $     e Zd Z fdZd Z xZS )EntityPredictionHeadTransformc                 V   t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        n|j        | _        t          j        |j        |j                  | _        d S r   )rM   rN   r   r   rQ   r   r   r   r   r   r   transform_act_fnrX   rY   r]   s     r'   rN   z&EntityPredictionHeadTransform.__init__  s    Yv163IJJ
f'-- 	6$*6+<$=D!!$*$5D!f&<&BWXXXr&   c                     |                      |          }|                     |          }|                     |          }|S r   )r   r<  rX   r   s     r'   rr   z%EntityPredictionHeadTransform.forward  s=    

=11--m<<}55r&   r0  ry   s   @r'   r:  r:    sL        Y Y Y Y Y      r&   r:  c                   $     e Zd Z fdZd Z xZS )EntityPredictionHeadc                 *   t                                                       || _        t          |          | _        t          j        |j        |j        d          | _	        t          j
        t          j        |j                            | _        d S )NFr}   )rM   rN   r_   r:  	transformr   r   r   r   decoder	Parameterr!   rj   r~   r]   s     r'   rN   zEntityPredictionHead.__init__  sp    6v>>y!79QX]^^^LV-E!F!FGG			r&   c                 j    |                      |          }|                     |          | j        z   }|S r   )rA  rB  r~   r   s     r'   rr   zEntityPredictionHead.forward  s1    }55]33di?r&   r0  ry   s   @r'   r?  r?    sL        H H H H H      r&   r?  c                   @    e Zd ZU eed<   dZdZddgZdej	        fdZ
dS )	LukePreTrainedModelr_   lukeTr   r{   modulec                    t          |t          j                  rT|j        j                            d| j        j                   |j         |j        j        	                                 dS dS t          |t          j
                  r|j        dk    r|j        j        	                                 n+|j        j                            d| j        j                   |j        +|j        j        |j                 	                                 dS dS t          |t          j                  r?|j        j        	                                 |j        j                            d           dS dS )zInitialize the weightsg        )meanstdNr         ?)r   r   r   weightdatanormal_r_   initializer_ranger~   zero_rO   embedding_dimrJ   rX   fill_)r^   rH  s     r'   _init_weightsz!LukePreTrainedModel._init_weights  sO   fbi(( 	*M&&CT[5R&SSS{& &&((((( '&-- 		*#q(("((****"**9V*WWW!-"6#56<<>>>>> .--- 	*K""$$$M$$S)))))	* 	*r&   N)r   r   r   r   r#   base_model_prefixsupports_gradient_checkpointing_no_split_modulesr   ModulerT  r%   r&   r'   rF  rF    sW         &*#(*@A*BI * * * * * *r&   rF  zt
    The bare LUKE model transformer outputting raw hidden-states for both word tokens and entities without any
    c                        e Zd Zddedef fdZd Zd Zd Zd Z	d	 Z
e	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 deej                 deej                 deej                 deej                 dee         dee         dee         deeef         fd            Zdej        deej                 fdZ xZS )	LukeModelTr_   add_pooling_layerc                 (   t                                          |           || _        t          |          | _        t          |          | _        t          |          | _        |rt          |          nd| _
        |                                  dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)rM   rN   r_   rG   rq   r{   r   r  encoderr2  pooler	post_init)r^   r_   r[  r`   s      r'   rN   zLukeModel.__init__+  s    
 	   (00!5f!=!="6**,=Gj(((4 	r&   c                     | j         j        S r   rq   rS   r^   s    r'   get_input_embeddingszLukeModel.get_input_embeddings<  s    ..r&   c                     || j         _        d S r   ra  r^   r   s     r'   set_input_embeddingszLukeModel.set_input_embeddings?  s    */'''r&   c                     | j         j         S r   r   rb  s    r'   get_entity_embeddingszLukeModel.get_entity_embeddingsB  s    %77r&   c                     || j         _         d S r   rh  re  s     r'   set_entity_embeddingszLukeModel.set_entity_embeddingsE  s    38000r&   c                      t          d          r   r   )r^   heads_to_prunes     r'   _prune_headszLukeModel._prune_headsH  r   r&   Nrm   r   rn   rl   r   entity_attention_maskentity_token_type_idsentity_position_idsr   ro   r   r'  r(  r   c           	      \   ||n| j         j        }||n| j         j        }||n| j         j        }||
t	          d          |+|                     ||           |                                }n.|
|
                                dd         }nt	          d          |\  }}||j        n|
j        }|t          j	        ||f|          }|!t          j
        |t          j        |          }|T|                    d          }|t          j	        ||f|          }|#t          j
        ||ft          j        |          }|                     |	| j         j                  }	|                     ||||
          }|                     ||          }|d}n|                     |||          }|                     ||||	|||	          }|d
         }| j        |                     |          nd}|s||f|dd         z   S t'          |||j        |j        |j        |j                  S )uz  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, LukeModel

        >>> tokenizer = AutoTokenizer.from_pretrained("studio-ousia/luke-base")
        >>> model = LukeModel.from_pretrained("studio-ousia/luke-base")
        # Compute the contextualized entity representation corresponding to the entity mention "Beyoncé"

        >>> text = "Beyoncé lives in Los Angeles."
        >>> entity_spans = [(0, 7)]  # character-based entity span corresponding to "Beyoncé"

        >>> encoding = tokenizer(text, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt")
        >>> outputs = model(**encoding)
        >>> word_last_hidden_state = outputs.last_hidden_state
        >>> entity_last_hidden_state = outputs.entity_last_hidden_state
        # Input Wikipedia entities to obtain enriched contextualized representations of word tokens

        >>> text = "Beyoncé lives in Los Angeles."
        >>> entities = [
        ...     "Beyoncé",
        ...     "Los Angeles",
        ... ]  # Wikipedia entity titles corresponding to the entity mentions "Beyoncé" and "Los Angeles"
        >>> entity_spans = [
        ...     (0, 7),
        ...     (17, 28),
        ... ]  # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"

        >>> encoding = tokenizer(
        ...     text, entities=entities, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt"
        ... )
        >>> outputs = model(**encoding)
        >>> word_last_hidden_state = outputs.last_hidden_state
        >>> entity_last_hidden_state = outputs.entity_last_hidden_state
        ```NzDYou cannot specify both input_ids and inputs_embeds at the same timerb   z5You have to specify either input_ids or inputs_embeds)re   rc   r   )rm   rl   rn   ro   )r   r   r   r'  r(  r   )r%  pooler_outputr1   r2   r   r   )r_   r   r'  use_return_dictr   %warn_if_padding_and_no_attention_maskri   re   r!   onesrj   rk   get_head_maskr  rq   get_extended_attention_maskr   r]  r^  r   r1   r2   r   r   )r^   rm   r   rn   rl   r   ro  rp  rq  r   ro   r   r'  r(  rp   
batch_size
seq_lengthre   entity_seq_lengthword_embedding_outputextended_attention_maskentity_embedding_outputencoder_outputssequence_outputr8  s                            r'   rr   zLukeModel.forwardK  s   R 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B] ]%>cddd"66y.QQQ#..**KK&',,..ss3KKTUUU!,
J%.%:!!@T!"ZZ(@PPPN!"[EJvVVVN! * 2 2$,(-
J@Q3R[a(b(b(b%$,(-ZAR4S[`[ent(u(u(u% &&y$+2OPP	 !%%)'	 !0 !
 !
 #'"B"B>Sh"i"i &*##&*&<&<ZI\^s&t&t# ,,!#2/!5# ' 
 
 *!, 9=8OO444UY 	J#]3oabb6III--')7&1%4%M!0!E
 
 
 	
r&   word_attention_maskc                    |}|t          j        ||gd          }|                                dk    r|dddddddf         }nA|                                dk    r|ddddddf         }nt          d|j         d          |                    | j                  }d	|z
  t          j        | j                  j        z  }|S )
ac  
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            word_attention_mask (`torch.LongTensor`):
                Attention mask for word tokens with ones indicating tokens to attend to, zeros for tokens to ignore.
            entity_attention_mask (`torch.LongTensor`, *optional*):
                Attention mask for entity tokens with ones indicating tokens to attend to, zeros for tokens to ignore.

        Returns:
            `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
        Nrb   r   r
   r   z&Wrong shape for attention_mask (shape ))rd   rL  )	r!   r   r   r   shaperg   rd   finfor   )r^   r  ro  r   r}  s        r'   rx  z%LukeModel.get_extended_attention_mask  s     - ,"Y8M'NTVWWWN1$$&4QQQaaa]&C##!!Q&&&4QQQdAAA5E&F##]nFZ]]]^^^"9"<"<4:"<"N"N#&)@#@EKPTPZD[D[D_"_&&r&   )T)NNNNNNNNNNNNN)r   r   r   r   boolrN   rc  rf  ri  rk  rn  r   r   r!   r   r"   r   r$   r   rr   rx  rx   ry   s   @r'   rZ  rZ  %  s!        z d      "/ / /0 0 08 8 89 9 9Z Z Z  156:593715=A<@:>1559,0/3&*Y
 Y
E,-Y
 !!23Y
 !!12	Y

 u/0Y
 U-.Y
  ((9:Y
  ((89Y
 &e&67Y
 E-.Y
   12Y
 $D>Y
 'tnY
 d^Y
 
u44	5Y
 Y
 Y
 ^Y
v'#(#3'LTUZUeLf' ' ' ' ' ' ' 'r&   rZ  c                     |                      |                                          }t          j        |d                              |          |z  }|                                |z   S )a  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    r   r   )ner   r!   cumsumr   rk   )rm   rJ   maskincremental_indicess       r'   rf   rf     s`     <<$$((**D <!444<<TBBdJ##%%33r&   c                   .     e Zd ZdZ fdZd Zd Z xZS )
LukeLMHeadz*Roberta Head for masked language modeling.c                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j        |j	                  | _
        t          j        t          j        |j	                            | _        | j        | j
        _        d S r   )rM   rN   r   r   rQ   r   rX   rY   
layer_normrP   rB  rC  r!   rj   r~   r]   s     r'   rN   zLukeLMHead.__init__  s    Yv163EFF
,v'9v?TUUUy!3V5FGGLV->!?!?@@	 Ir&   c                     |                      |          }t          |          }|                     |          }|                     |          }|S r   )r   r   r  rB  )r^   featureskwargsr   s       r'   rr   zLukeLMHead.forward#  sE    JJx  GGOOA LLOOr&   c                     | j         j        j        j        dk    r| j        | j         _        d S | j         j        | _        d S )Nmeta)rB  r~   re   typerb  s    r'   _tie_weightszLukeLMHead._tie_weights-  s<     <#(F22 $	DL)DIIIr&   )r   r   r   r    rN   rr   r  rx   ry   s   @r'   r  r    s\        44& & & & &  * * * * * * *r&   r  z
    The LUKE model with a language modeling head and entity prediction head on top for masked language modeling and
    masked entity prediction.
    c            $           e Zd Zg dZ fdZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dde	e
j                 de	e
j                 d	e	e
j                 d
e	e
j                 de	e
j                 de	e
j                 de	e
j                 de	e
j                 de	e
j                 de	e
j                 de	e
j                 de	e
j                 de	e         de	e         de	e         deeef         f d            Z xZS )LukeForMaskedLM)zlm_head.decoder.weightzlm_head.decoder.biasz!entity_predictions.decoder.weightc                    t                                          |           t          |          | _        t	          |          | _        t          |          | _        t          j	                    | _
        |                                  d S r   )rM   rN   rZ  rG  r  lm_headr?  entity_predictionsr   r   loss_fnr_  r]   s     r'   rN   zLukeForMaskedLM.__init__?  sq       f%%	!&))"6v">">*,, 	r&   c                     t                                                       |                     | j        j        | j        j        j                   d S r   )rM   tie_weights_tie_or_clone_weightsr  rB  rG  r   )r^   r`   s    r'   r  zLukeForMaskedLM.tie_weightsL  sC    ""4#:#BDID_Dqrrrrrr&   c                     | j         j        S r   r  rB  rb  s    r'   get_output_embeddingsz%LukeForMaskedLM.get_output_embeddingsP  s    |##r&   c                     || j         _        d S r   r  )r^   new_embeddingss     r'   set_output_embeddingsz%LukeForMaskedLM.set_output_embeddingsS  s    -r&   Nrm   r   rn   rl   r   ro  rp  rq  labelsentity_labelsr   ro   r   r'  r(  r   c                    ||n| j         j        }|                     ||||||||||||d          }d}d}|                     |j                  }|	e|	                    |j                  }	|                     |                    d| j         j	                  |	                    d                    }||}d}d}|j
        m|                     |j
                  }|
Q|                     |                    d| j         j                  |
                    d                    }||}n||z   }|s0t          d ||||||j        |j        |j        fD                       S t#          ||||||j        |j        |j                  S )aC  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        entity_labels (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        NTrm   r   rn   rl   r   ro  rp  rq  r   ro   r   r'  r(  rb   c              3      K   | ]}||V  	d S r   r%   r!  s     r'   r#  z*LukeForMaskedLM.forward.<locals>.<genexpr>  s4         =  !=== r&   )r,   r-   r.   r/   r0   r1   r   r2   )r_   rt  rG  r  r%  rg   re   r  r   rP   r   r  r   r$   r1   r   r2   r+   )r^   rm   r   rn   rl   r   ro  rp  rq  r  r  r   ro   r   r'  r(  r   r,   r-   r/   r.   r0   s                         r'   rr   zLukeForMaskedLM.forwardV  s   b &1%<kk$+B]))))%!"7"7 3'/!5  
 
  g788YYv}--F||FKKDK4J$K$KV[[Y[__]]H|+7 33G4TUUM(<<(:(:2t{?\(](]_l_q_qrt_u_uvv<#DD(?D 	   !)0&	      "'!/!(!=)	
 	
 	
 		
r&   NNNNNNNNNNNNNNN)r   r   r   _tied_weights_keysrN   r  r  r  r   r   r!   r   r"   r  r   r$   r+   rr   rx   ry   s   @r'   r  r  6  s        qpp    s s s s s$ $ $. . .  156:593715<@<@:>-1481559,0/3&*!q
 q
E,-q
 !!23q
 !!12	q

 u/0q
 U-.q
  ((89q
  ((89q
 &e&67q
 )*q
   01q
 E-.q
   12q
 $D>q
 'tnq
  d^!q
" 
u((	)#q
 q
 q
 ^q
 q
 q
 q
 q
r&   r  z
    The LUKE model with a classification head on top (a linear layer on top of the hidden state of the first entity
    token) for entity classification tasks, such as Open Entity.
    c            "           e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
eej                 deej                 deej                 deej                 dee	         dee	         dee	         de
eef         fd            Z xZS )LukeForEntityClassificationc                 6   t                                          |           t          |          | _        |j        | _        t          j        |j                  | _        t          j	        |j
        |j                  | _        |                                  d S r   rM   rN   rZ  rG  
num_labelsr   rZ   r[   r\   r   rQ   
classifierr_  r]   s     r'   rN   z$LukeForEntityClassification.__init__  sy       f%%	 +z&"<==)F$68IJJ 	r&   Nrm   r   rn   rl   r   ro  rp  rq  r   ro   r  r   r'  r(  r   c                    ||n| j         j        }|                     |||||||||	|
||d          }|j        dddddf         }|                     |          }|                     |          }d}||                    |j                  }|j        dk    r!t          j
                            ||          }nYt          j
                            |                    d          |                    d                              |                    }|s-t          d |||j        |j        |j        fD                       S t'          |||j        |j        |j                  S )	u
  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        labels (`torch.LongTensor` of shape `(batch_size,)` or `(batch_size, num_labels)`, *optional*):
            Labels for computing the classification loss. If the shape is `(batch_size,)`, the cross entropy loss is
            used for the single-label classification. In this case, labels should contain the indices that should be in
            `[0, ..., config.num_labels - 1]`. If the shape is `(batch_size, num_labels)`, the binary cross entropy
            loss is used for the multi-label classification. In this case, labels should only contain `[0, 1]`, where 0
            and 1 indicate false and true, respectively.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, LukeForEntityClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-open-entity")
        >>> model = LukeForEntityClassification.from_pretrained("studio-ousia/luke-large-finetuned-open-entity")

        >>> text = "Beyoncé lives in Los Angeles."
        >>> entity_spans = [(0, 7)]  # character-based entity span corresponding to "Beyoncé"
        >>> inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: person
        ```NTr  r   r   rb   c              3      K   | ]}||V  	d S r   r%   r!  s     r'   r#  z6LukeForEntityClassification.forward.<locals>.<genexpr>>  0        =  === r&   r,   r/   r1   r   r2   )r_   rt  rG  r   r\   r  rg   re   ndimr   r   cross_entropy binary_cross_entropy_with_logitsr   r   r$   r1   r   r2   r4   r^   rm   r   rn   rl   r   ro  rp  rq  r   ro   r  r   r'  r(  r   feature_vectorr/   r,   s                      r'   rr   z#LukeForEntityClassification.forward  s   | &1%<kk$+B]))))%!"7"7 3'/!5  
 
  !9!!!Q'Bn5500 YYv}--F{a}2266BB}EEfkkRTooW]WbWbceWfWfWnWnouWvWvww 	  (=w?[]d]op      *!/!(!=)
 
 
 	
r&   NNNNNNNNNNNNNN)r   r   r   rN   r   r   r!   r   r"   r  r   r$   r4   rr   rx   ry   s   @r'   r  r    s       
 
 
 
 
  156:593715=A<@:>1559.2,0/3&*k
 k
E,-k
 !!23k
 !!12	k

 u/0k
 U-.k
  ((9:k
  ((89k
 &e&67k
 E-.k
   12k
 *+k
 $D>k
 'tnk
 d^k
  
u00	1!k
 k
 k
 ^k
 k
 k
 k
 k
r&   r  z
    The LUKE model with a classification head on top (a linear layer on top of the hidden states of the two entity
    tokens) for entity pair classification tasks, such as TACRED.
    c            "           e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
eej                 deej                 deej                 deej                 dee	         dee	         dee	         de
eef         fd            Z xZS )LukeForEntityPairClassificationc                 >   t                                          |           t          |          | _        |j        | _        t          j        |j                  | _        t          j	        |j
        dz  |j        d          | _        |                                  d S )Nr   Fr  r]   s     r'   rN   z(LukeForEntityPairClassification.__init__T  s       f%%	 +z&"<==)F$6$:F<MuUU 	r&   Nrm   r   rn   rl   r   ro  rp  rq  r   ro   r  r   r'  r(  r   c                    ||n| j         j        }|                     |||||||||	|
||d          }t          j        |j        dddddf         |j        dddddf         gd          }|                     |          }|                     |          }d}||                    |j	                  }|j
        dk    r!t          j                            ||          }nYt          j                            |                    d          |                    d                              |                    }|s-t#          d |||j        |j        |j        fD                       S t+          |||j        |j        |j        	          S )
u  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        labels (`torch.LongTensor` of shape `(batch_size,)` or `(batch_size, num_labels)`, *optional*):
            Labels for computing the classification loss. If the shape is `(batch_size,)`, the cross entropy loss is
            used for the single-label classification. In this case, labels should contain the indices that should be in
            `[0, ..., config.num_labels - 1]`. If the shape is `(batch_size, num_labels)`, the binary cross entropy
            loss is used for the multi-label classification. In this case, labels should only contain `[0, 1]`, where 0
            and 1 indicate false and true, respectively.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, LukeForEntityPairClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-tacred")
        >>> model = LukeForEntityPairClassification.from_pretrained("studio-ousia/luke-large-finetuned-tacred")

        >>> text = "Beyoncé lives in Los Angeles."
        >>> entity_spans = [
        ...     (0, 7),
        ...     (17, 28),
        ... ]  # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"
        >>> inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: per:cities_of_residence
        ```NTr  r   r   r   rb   c              3      K   | ]}||V  	d S r   r%   r!  s     r'   r#  z:LukeForEntityPairClassification.forward.<locals>.<genexpr>  r  r&   r  )r_   rt  rG  r!   r   r   r\   r  rg   re   r  r   r   r  r  r   r   r$   r1   r   r2   r9   r  s                      r'   rr   z'LukeForEntityPairClassification.forward`  s   B &1%<kk$+B]))))%!"7"7 3'/!5  
 
  -aaaAAAg68XYZYZYZ\]_`_`_`Y`8abhi
 
 
 n5500 YYv}--F{a}2266BB}EEfkkRTooW]WbWbceWfWfWnWnouWvWvww 	  (=w?[]d]op      .!/!(!=)
 
 
 	
r&   r  )r   r   r   rN   r   r   r!   r   r"   r  r   r$   r9   rr   rx   ry   s   @r'   r  r  M  s       
 
 
 
 
  156:593715=A<@:>1559-1,0/3&*p
 p
E,-p
 !!23p
 !!12	p

 u/0p
 U-.p
  ((9:p
  ((89p
 &e&67p
 E-.p
   12p
 )*p
 $D>p
 'tnp
 d^p
  
u44	5!p
 p
 p
 ^p
 p
 p
 p
 p
r&   r  z
    The LUKE model with a span classification head on top (a linear layer on top of the hidden states output) for tasks
    such as named entity recognition.
    c            &           e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
eej                 deej                 deej                 deej                 deej                 deej                 dee	         dee	         dee	         de
eef         f"d            Z xZS )LukeForEntitySpanClassificationc                 <   t                                          |           t          |          | _        |j        | _        t          j        |j                  | _        t          j	        |j
        dz  |j                  | _        |                                  d S )Nr
   r  r]   s     r'   rN   z(LukeForEntitySpanClassification.__init__  s~       f%%	 +z&"<==)F$6$:F<MNN 	r&   Nrm   r   rn   rl   r   ro  rp  rq  entity_start_positionsentity_end_positionsr   ro   r  r   r'  r(  r   c                 P   ||n| j         j        }|                     ||||||||||||d          }|j                            d          }|	                    d                              dd|          }	|	j        |j        j        k    r|	                    |j        j                  }	t          j
        |j        d|	          }|
                    d                              dd|          }
|
j        |j        j        k    r|
                    |j        j                  }
t          j
        |j        d|
          }t          j        |||j        gd          }|                     |          }|                     |          }d}||                    |j                  }|j        dk    rMt           j                            |                    d| j                  |                    d                    }nYt           j                            |                    d          |                    d                              |                    }|s-t/          d |||j        |j        |j        fD                       S t7          |||j        |j        |j        	          S )
u  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        entity_start_positions (`torch.LongTensor`):
            The start positions of entities in the word token sequence.
        entity_end_positions (`torch.LongTensor`):
            The end positions of entities in the word token sequence.
        labels (`torch.LongTensor` of shape `(batch_size, entity_length)` or `(batch_size, entity_length, num_labels)`, *optional*):
            Labels for computing the classification loss. If the shape is `(batch_size, entity_length)`, the cross
            entropy loss is used for the single-label classification. In this case, labels should contain the indices
            that should be in `[0, ..., config.num_labels - 1]`. If the shape is `(batch_size, entity_length,
            num_labels)`, the binary cross entropy loss is used for the multi-label classification. In this case,
            labels should only contain `[0, 1]`, where 0 and 1 indicate false and true, respectively.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, LukeForEntitySpanClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")
        >>> model = LukeForEntitySpanClassification.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")

        >>> text = "Beyoncé lives in Los Angeles"
        # List all possible entity spans in the text

        >>> word_start_positions = [0, 8, 14, 17, 21]  # character-based start positions of word tokens
        >>> word_end_positions = [7, 13, 16, 20, 28]  # character-based end positions of word tokens
        >>> entity_spans = []
        >>> for i, start_pos in enumerate(word_start_positions):
        ...     for end_pos in word_end_positions[i:]:
        ...         entity_spans.append((start_pos, end_pos))

        >>> inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> predicted_class_indices = logits.argmax(-1).squeeze().tolist()
        >>> for span, predicted_class_idx in zip(entity_spans, predicted_class_indices):
        ...     if predicted_class_idx != 0:
        ...         print(text[span[0] : span[1]], model.config.id2label[predicted_class_idx])
        Beyoncé PER
        Los Angeles LOC
        ```NTr  rb   r   r   r   c              3      K   | ]}||V  	d S r   r%   r!  s     r'   r#  z:LukeForEntitySpanClassification.forward.<locals>.<genexpr>d  r  r&   r  )r_   rt  rG  r%  ri   ru   rv   re   rg   r!   gatherr   r   r\   r  r  r   r   r  r   r  r  r   r$   r1   r   r2   r;   )r^   rm   r   rn   rl   r   ro  rp  rq  r  r  r   ro   r  r   r'  r(  r   rQ   start_states
end_statesr  r/   r,   s                           r'   rr   z'LukeForEntitySpanClassification.forward  s   ^ &1%<kk$+B]))))%!"7"7 3'/!5  
 
 /44R88!7!A!A"!E!E!L!LRQSU`!a!a!(G,E,LLL%;%>%>w?X?_%`%`"|G$=rCYZZ3==bAAHHRQ\]]&'*C*JJJ#7#:#:7;T;[#\#\ \'";RAUVV
L*g>^#_efgggn5500YYv}--F {a}226;;r4?3S3SU[U`U`acUdUdee}EEfkkRTooW]WbWbceWfWfWnWnouWvWvww 	  (=w?[]d]op      .!/!(!=)
 
 
 	
r&   )NNNNNNNNNNNNNNNN)r   r   r   rN   r   r   r!   r   r"   r  r   r$   r;   rr   rx   ry   s   @r'   r  r    s       
 
 
 
 
  156:593715<@<@:>=A;?1559-1,0/3&*#H
 H
E,-H
 !!23H
 !!12	H

 u/0H
 U-.H
  ((89H
  ((89H
 &e&67H
 !))9 :H
 'u'78H
 E-.H
   12H
 )*H
 $D>H
  'tn!H
" d^#H
$ 
u44	5%H
 H
 H
 ^H
 H
 H
 H
 H
r&   r  z
    The LUKE Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c            "           e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
eej                 deej                 deej                 deej                 dee	         dee	         dee	         de
eef         fd            Z xZS )LukeForSequenceClassificationc                 R   t                                          |           |j        | _        t          |          | _        t          j        |j        |j        n|j                  | _	        t          j
        |j        |j                  | _        |                                  d S r   rM   rN   r  rZ  rG  r   rZ   classifier_dropoutr[   r\   r   rQ   r  r_  r]   s     r'   rN   z&LukeForSequenceClassification.__init__z  s        +f%%	z)/)B)NF%%TZTn
 
 )F$68IJJ 	r&   Nrm   r   rn   rl   r   ro  rp  rq  r   ro   r  r   r'  r(  r   c                 ^   ||n| j         j        }|                     |||||||||	|
||d          }|j        }|                     |          }|                     |          }d}|t|                    |j                  }| j         j        f| j	        dk    rd| j         _        nN| j	        dk    r7|j
        t          j        k    s|j
        t          j        k    rd| j         _        nd| j         _        | j         j        dk    rWt                      }| j	        dk    r1 ||                                |                                          }n |||          }n| j         j        dk    rGt!                      } ||                    d| j	                  |                    d                    }n*| j         j        dk    rt%                      } |||          }|s-t'          d	 |||j        |j        |j        fD                       S t/          |||j        |j        |j        
          S )a  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NTr  r   
regressionsingle_label_classificationmulti_label_classificationrb   c              3      K   | ]}||V  	d S r   r%   r!  s     r'   r#  z8LukeForSequenceClassification.forward.<locals>.<genexpr>  r  r&   r  )r_   rt  rG  rs  r\   r  rg   re   problem_typer  rd   r!   rk   r   r	   squeezer   r   r   r$   r1   r   r2   r=   )r^   rm   r   rn   rl   r   ro  rp  rq  r   ro   r  r   r'  r(  r   r8  r/   r,   loss_fcts                       r'   rr   z%LukeForSequenceClassification.forward  sZ   V &1%<kk$+B]))))%!"7"7 3'/!5  
 
   -]33//YYv}--F{'/?a''/;DK,,_q((flej.H.HFL\a\eLeLe/LDK,,/KDK,{'<77"99?a''#8FNN$4$4fnn6F6FGGDD#8FF33DD)-JJJ+--xB @ @&++b//RR)-III,..x// 	  (=w?[]d]op      ,!/!(!=)
 
 
 	
r&   r  )r   r   r   rN   r   r   r!   r   r"   r  r   r$   r=   rr   rx   ry   s   @r'   r  r  s  s       
 
 
 
 
  156:593715=A<@:>1559.2,0/3&*g
 g
E,-g
 !!23g
 !!12	g

 u/0g
 U-.g
  ((9:g
  ((89g
 &e&67g
 E-.g
   12g
 *+g
 $D>g
 'tng
 d^g
  
u22	3!g
 g
 g
 ^g
 g
 g
 g
 g
r&   r  z
    The LUKE Model with a token classification head on top (a linear layer on top of the hidden-states output). To
    solve Named-Entity Recognition (NER) task using LUKE, `LukeForEntitySpanClassification` is more suitable than this
    class.
    c            "           e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
eej                 deej                 deej                 deej                 dee	         dee	         dee	         de
eef         fd            Z xZS )LukeForTokenClassificationc                 V   t                                          |           |j        | _        t          |d          | _        t          j        |j        |j        n|j                  | _	        t          j
        |j        |j                  | _        |                                  d S NF)r[  r  r]   s     r'   rN   z#LukeForTokenClassification.__init__  s        +f>>>	z)/)B)NF%%TZTn
 
 )F$68IJJ 	r&   Nrm   r   rn   rl   r   ro  rp  rq  r   ro   r  r   r'  r(  r   c                 4   ||n| j         j        }|                     |||||||||	|
||d          }|j        }|                     |          }|                     |          }d}|`|                    |j                  }t                      } ||	                    d| j
                  |	                    d                    }|s-t          d |||j        |j        |j        fD                       S t          |||j        |j        |j                  S )aM  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        NTr  rb   c              3      K   | ]}||V  	d S r   r%   r!  s     r'   r#  z5LukeForTokenClassification.forward.<locals>.<genexpr>P  r  r&   r  )r_   rt  rG  r%  r\   r  rg   re   r   r   r  r$   r1   r   r2   r?   )r^   rm   r   rn   rl   r   ro  rp  rq  r   ro   r  r   r'  r(  r   r  r/   r,   r  s                       r'   rr   z"LukeForTokenClassification.forward  sS   V &1%<kk$+B]))))%!"7"7 3'/!5  
 
  "3,,7711YYv}--F'))H8FKKDO<<fkk"ooNND 	  (=w?[]d]op      )!/!(!=)
 
 
 	
r&   r  )r   r   r   rN   r   r   r!   r   r"   r  r   r$   r?   rr   rx   ry   s   @r'   r  r    s             156:593715=A<@:>1559.2,0/3&*U
 U
E,-U
 !!23U
 !!12	U

 u/0U
 U-.U
  ((9:U
  ((89U
 &e&67U
 E-.U
   12U
 *+U
 $D>U
 'tnU
 d^U
  
u//	0!U
 U
 U
 ^U
 U
 U
 U
 U
r&   r  c            $           e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
eej                 deej                 deej                 deej                 deej                 dee	         dee	         dee	         de
eef         f d            Z xZS )LukeForQuestionAnsweringc                     t                                          |           |j        | _        t          |d          | _        t          j        |j        |j                  | _        | 	                                 d S r  )
rM   rN   r  rZ  rG  r   r   rQ   
qa_outputsr_  r]   s     r'   rN   z!LukeForQuestionAnswering.__init__a  sj        +f>>>	)F$68IJJ 	r&   Nrm   r   rn   rl   r   ro  rp  rq  r   ro   start_positionsend_positionsr   r'  r(  r   c                    ||n| j         j        }|                     |||||||||	|
||d          }|j        }|                     |          }|                    dd          \  }}|                    d          }|                    d          }d}||t          |                                          dk    r|                    d          }t          |                                          dk    r|                    d          }|                    d          }|	                    d|           |	                    d|           t          |          } |||          } |||          }||z   d	z  }|s.t          d
 ||||j        |j        |j        fD                       S t          ||||j        |j        |j                  S )a  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        NTr  r   rb   r   r   )ignore_indexr   c              3      K   | ]}||V  	d S r   r%   r!  s     r'   r#  z3LukeForQuestionAnswering.forward.<locals>.<genexpr>  s4         =  !=== r&   )r,   rB   rC   r1   r   r2   )r_   rt  rG  r%  r  splitr  lenri   clamp_r   r$   r1   r   r2   rA   )r^   rm   r   rn   rl   r   ro  rp  rq  r   ro   r  r  r   r'  r(  r   r  r/   rB   rC   
total_lossignored_indexr  
start_lossend_losss                             r'   rr   z LukeForQuestionAnswering.forwardl  s/   P &1%<kk$+B]))))%!"7"7 3'/!5  
 
  "311#)<<r<#:#: j#++B//''++

&=+D?''))**Q.."1"9"9""="==%%''((1,, - 5 5b 9 9(--a00M""1m444  M222']CCCH!,@@Jx
M::H$x/14J 	    )0&      0%!!/!(!=)
 
 
 	
r&   r  )r   r   r   rN   r   r   r!   r   r"   r  r   r$   rA   rr   rx   ry   s   @r'   r  r  _  s       	 	 	 	 	  156:594815=A<@:>15596:48,0/3&*!f
 f
E,-f
 !!23f
 !!12	f

 u01f
 U-.f
  ((9:f
  ((89f
 &e&67f
 E-.f
   12f
 "%"23f
   01f
 $D>f
 'tnf
  d^!f
" 
u66	7#f
 f
 f
 ^f
 f
 f
 f
 f
r&   r  c            "           e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
eej                 deej                 deej                 deej                 dee	         dee	         dee	         de
eef         fd            Z xZS )LukeForMultipleChoicec                 0   t                                          |           t          |          | _        t	          j        |j        |j        n|j                  | _        t	          j	        |j
        d          | _        |                                  d S r  )rM   rN   rZ  rG  r   rZ   r  r[   r\   r   rQ   r  r_  r]   s     r'   rN   zLukeForMultipleChoice.__init__  s       f%%	z)/)B)NF%%TZTn
 
 )F$6:: 	r&   Nrm   r   rn   rl   r   ro  rp  rq  r   ro   r  r   r'  r(  r   c                    ||n| j         j        }||j        d         n|
j        d         }|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|
=|
                    d|
                    d          |
                    d                    nd}
|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|=|                    d|                    d          |                    d                    nd}|                     |||||||||	|
||d          }|j        }|                     |          }|                     |          }|                    d|          }d}|4|	                    |j
                  }t                      } |||          }|s-t          d |||j        |j        |j        fD                       S t!          |||j        |j        |j                  S )	a^  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   rb   r   Tr  c              3      K   | ]}||V  	d S r   r%   r!  s     r'   r#  z0LukeForMultipleChoice.forward.<locals>.<genexpr>c  r$  r&   r  )r_   rt  r  r   ri   rG  rs  r\   r  rg   re   r   r$   r1   r   r2   rE   )r^   rm   r   rn   rl   r   ro  rp  rq  r   ro   r  r   r'  r(  num_choicesr   r8  r/   reshaped_logitsr,   r  s                         r'   rr   zLukeForMultipleChoice.forward  sT   F &1%<kk$+B],5,Aioa((}GZ[\G]>G>SINN2y~~b'9'9:::Y]	M[Mg,,R1D1DR1H1HIIImqM[Mg,,R1D1DR1H1HIIImqGSG_|((\->->r-B-BCCCei ( r=#5#5b#9#9=;M;Mb;Q;QRRR 	 BLAWZ__R)<)<===]a
 %0 "&&r+@+E+Eb+I+IJJJ 	 %0 "&&r+@+E+Eb+I+IJJJ 	 #.  $$R)<)A)A")E)EGZG_G_`bGcGcddd 	 ))))%!"7"7 3'/!5  
 
   -]33// ++b+66YY566F'))H8OV44D 	 
 
 #)0&
 
 
 
 
 
 -"!/!(!=)
 
 
 	
r&   r  )r   r   r   rN   r   r   r!   r   r"   r  r   r$   rE   rr   rx   ry   s   @r'   r  r    s       
 
 
 
 
  156:593715=A<@:>1559.2,0/3&*P
 P
E,-P
 !!23P
 !!12	P

 u/0P
 U-.P
  ((9:P
  ((89P
 &e&67P
 E-.P
   12P
 *+P
 $D>P
 'tnP
 d^P
  
u33	4!P
 P
 P
 ^P
 P
 P
 P
 P
r&   r  )
r  r  r  r  r  r  r  r  rZ  rF  )Fr    r   dataclassesr   typingr   r   r!   r   torch.nnr   r   r	   activationsr   r   modeling_layersr   modeling_outputsr   r   modeling_utilsr   pytorch_utilsr   utilsr   r   r   configuration_luker   
get_loggerr   loggerr   r)   r+   r4   r9   r;   r=   r?   rA   rE   rX  rG   r{   r   r   r   r   r   r  r  r2  r:  r?  rF  rZ  rf   r  r  r  r  r  r  r  r  r  __all__r%   r&   r'   <module>r     sP
      ! ! ! ! ! ! " " " " " " " "        A A A A A A A A A A ' ' ' ' ' ' ' ' 9 9 9 9 9 9 K K K K K K K K - - - - - - 6 6 6 6 6 6 9 9 9 9 9 9 9 9 9 9 * * * * * * 
	H	%	%   
I I I I I%? I I  I"   
I I I I I/ I I  I   
? ? ? ? ? ? ?  ?8   
? ? ? ? ? ? ?  ?&   
? ? ? ? ?[ ? ?  ?&   
? ? ? ? ?[ ? ?  ?&   
? ? ? ? ?; ? ?  ?&   
? ? ? ? ? ? ?  ?&   
? ? ? ? ?{ ? ?  ?$   
? ? ? ? ?K ? ?  ?*F= F= F= F= F=RY F= F= F=R( ( ( ( (29 ( ( (Vn n n n n	 n n nd    RY   , , , , ,BI , , ,`    ry           1 1 1 1 1* 1 1 1hA
 A
 A
 A
 A
") A
 A
 A
J           BI   "    29    * * * * */ * * *0   
Y' Y' Y' Y' Y'# Y' Y' 
Y'x4 4 4"* * * * * * * *>   L
 L
 L
 L
 L
) L
 L
 L
^   y
 y
 y
 y
 y
"5 y
 y
 y
x   ~
 ~
 ~
 ~
 ~
&9 ~
 ~
 ~
B   V
 V
 V
 V
 V
&9 V
 V
 V
r   u
 u
 u
 u
 u
$7 u
 u
 u
p   d
 d
 d
 d
 d
!4 d
 d
 d
N s
 s
 s
 s
 s
2 s
 s
 s
l ^
 ^
 ^
 ^
 ^
/ ^
 ^
 ^
B  r&   