
"""PyTorch XGLM model."""

import math
from typing import Optional, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, logging
from ...utils.deprecation import deprecate_kwarg
from .configuration_xglm import XGLMConfig


logger = logging.get_logger(__name__)


class XGLMScaledWordEmbedding(nn.Embedding):
    """
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: Optional[float] = 1.0):
        super().__init__(num_embeddings, embedding_dim, padding_idx)
        self.embed_scale = embed_scale

    def forward(self, input_ids: torch.Tensor):
        return super().forward(input_ids) * self.embed_scale
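
# Illustrative sketch (not executed on import): the class above only scales the
# embedding lookup, so with `embed_scale=2.0` the output is exactly twice the plain
# `nn.Embedding` lookup. Sizes and values below are arbitrary.
#
#   emb = XGLMScaledWordEmbedding(num_embeddings=10, embedding_dim=4, padding_idx=0, embed_scale=2.0)
#   ids = torch.tensor([[1, 2, 3]])
#   assert torch.allclose(emb(ids), nn.Embedding.forward(emb, ids) * 2.0)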

class XGLMSinusoidalPositionalEmbedding(nn.Module):
    """This module produces sinusoidal positional embeddings of any length."""

    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
        super().__init__()
        self.offset = 2
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx
        self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)

    def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
        emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
        if hasattr(self, "weights"):
            # in forward, put the weights on the correct dtype and device of the existing buffer
            emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)

        self.register_buffer("weights", emb_weights, persistent=False)

    @staticmethod
    def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
        """
        Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
        "Attention Is All You Need".
        """
        half_dim = embedding_dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
        emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
        if embedding_dim % 2 == 1:
            # zero pad the odd dimension
            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
        if padding_idx is not None:
            emb[padding_idx, :] = 0

        return emb.to(torch.get_default_dtype())

    @torch.no_grad()
    def forward(self, position_ids: Optional[torch.Tensor] = None, past_key_values_length: int = 0):
        bsz, seq_len = position_ids.size()
        position_ids += self.offset

        # Expand the embedding table if needed
        max_pos = 2 + seq_len + past_key_values_length
        if max_pos > self.weights.size(0):
            self.make_weights(max_pos, self.embedding_dim, self.padding_idx)

        return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach()
ee         dee         f fdZ e	ddd          	 	 	 	 	 	 dde
j        dee
j                 dee         dee
j                 dee
j                 dedee
j                 dee
j        ee
j                 eee
j                          f         fd            Z xZS )XGLMAttentionz=Multi-headed attention from 'Attention Is All You Need' paper        FTN	embed_dim	num_headsdropout
is_decoderbias	layer_idxc                    t                                                       || _        || _        || _        ||z  | _        | j        |z  | j        k    rt          d| j         d| d          | j        dz  | _        || _        || _	        t          j        |||          | _        t          j        |||          | _        t          j        |||          | _        t          j        |||          | _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩rj   )r    r!   rf   rg   rh   head_dim
ValueErrorscalingri   rk   r   Lineark_projv_projq_projout_proj)r"   rf   rg   rh   ri   rj   rk   r#   s          r$   r!   zXGLMAttention.__init__k   s	    	""!Y.MI%$.883dn 3 3%.3 3 3   }d*$"i	94@@@i	94@@@i	94@@@	)YTBBBr%   past_key_valuepast_key_values4.58new_nameversionhidden_stateskey_value_statesattention_masklayer_head_maskoutput_attentionscache_positionreturnc                    |du}|                                 \  }	}
}|r|j        d         n|
}|                     |          | j        z  }d}|Ht	          |t
                    r1|j                            | j                  }|r|j	        }n
|j
        }n|}|r|n|}|r3|1|r/|j        | j                 j        }|j        | j                 j        }n|                     |          }|                     |          }|                    |	|d| j                                      dd          }|                    |	|d| j                                      dd          }|N|s|nd}|                    ||| j        d|i          \  }}|r$t	          |t
                    rd|j        | j        <   |	| j        z  d| j        f}|                    |	|
| j        | j                                      dd          } |j        | } |j        | } |j        | }|                     d          }t-          j        ||                    dd                    }|                                 |	| j        z  |
|fk    r2t1          d|	| j        z  |
|f d	|                                            ||                                 |	d|
|fk    r+t1          d
|	d|
|f d	|                                            |                    |	| j        |
|          |z   }t-          j        |t-          j        t-          j        |j                  j        |j                            }|                    |	| j        z  |
|          }|j        t,          j        k    rJt@          j!        "                    |dt,          j#                  $                    t,          j                  }n!t@          j!        "                    |d          }||                                 | j        fk    r-t1          d| j        f d	|                                            |                    dddd          |                    |	| j        |
|          z  }|                    |	| j        z  |
|          }|r=|                    |	| j        |
|          }|                    |	| j        z  |
|          }nd}t@          j!        %                    || j%        | j&                  }t-          j        ||          }|                                 |	| j        z  |
| j        fk    r5t1          d|	| j        |
| j        f d	|                                            |                    |	| j        |
| j                  }|                    dd          }|                    |	|
| j'                  }| (                    |          }||fS )z#Input shape: Batch x Time x ChannelNr   FrH   r7   r   Tz$Attention weights should be of size z	, but is z!Attention mask should be of size )r>   )rG   r=   rF   z/Head mask for a single layer should be of size ptrainingz `attn_output` should be of size ))rZ   r\   rt   rp   
isinstancer
   
is_updatedgetrk   cross_attention_cacheself_attention_cachelayerskeysvaluesrr   rs   rR   rn   	transposeupdaterg   reshaper/   bmmro   maxtensorfinfor=   minr>   float16r   
functionalsoftmaxfloat32rB   rh   r   rf   ru   )r"   r|   r}   rw   r~   r   r   r   is_cross_attentionr^   tgt_len_src_lenquery_statesr   curr_past_key_valuecurrent_states
key_statesvalue_states
proj_shapeattn_weightsattn_weights_reshaped
attn_probsattn_outputs                           r$   r(   zXGLMAttention.forward   sP    .T9',,..Wa/AN"(++w {{=11DL@
&/+>?? 6,7;;DNKK
% O*9*O''*9*N''&5#-?R))] 	F/"=*"=,3DNCHJ.5dnELLL^44J;;~66L#gr4=IISSTUWXYYJ',,S'2t}MMWWXY[\]]L*7I!St+>+E+Edn?OQ_>`, ,(
L & F*_FY*Z*Z FAEO.t~>DN*B>
#((gt~t}UU__`acdee+|+Z8'Z'4
+|+Z8//!$$yz/C/CAq/I/IJJ3#7'"JJJ*dn8LgW^7_ * * %%''* *  
 %""$$a'(BBB ta'8Rtt]k]p]p]r]rtt   (,,S$.'7SSVddL 9el5;|7I+J+J+NWcWjkkk L (,,S4>-A7GTTL ..=002U]0[[^^_d_lmmLL=0020FFL&##%%$.)::: 1t~FW 1 1',,..1 1   +//2q!<<|?P?PQTVZVdfmov?w?wwL',,S4>-A7GTTL 	)
 %1$5$5c4>7T[$\$\!055cDN6JGU\]]LL$(!]**<4<RVR_*``
i
L99#"6!OOO)CRVR_3` ) )$$&&) )  
 "&&sDNGT]SS!++Aq11 "))#wGGmmK00111r%   )re   FTN)NNNNFN)r)   r*   r+   r,   r-   r   r.   boolr!   r   r/   r0   r   tupler(   r1   r2   s   @r$   rd   rd   h   s       GG $'%*#$(C CC C %	C
 TNC tnC D>C C C C C C: _%0A6RRR 48+/1526"'15}2 }2|}2 #5<0}2 "%	}2
 !.}2 "%,/}2  }2 !.}2 
u|Xel3XeEL>Q5RR	S}2 }2 }2 SR}2 }2 }2 }2 }2r%   rd   c                   R    e Zd Zddef fdZ eddd          	 	 	 	 	 	 	 	 	 dd
ej        deej                 deej                 deej                 deej                 deej                 dee	         dee
         dee
         deej                 dej        fd            Z xZS )XGLMDecoderLayerNconfigc                    t                                                       |j        | _        t	          | j        |j        |j        d|          | _        |j        | _        t          |j
                 | _        |j        | _        |j        rFt	          | j        |j        |j        d|          | _        t          j        | j                  | _        t          j        | j                  | _        t          j        | j        |j                  | _        t          j        |j        | j                  | _        t          j        | j                  | _        d S )NT)rf   rg   rh   ri   rk   )r    r!   d_modelrf   rd   attention_headsattention_dropout	self_attnrh   r   activation_functionactivation_fnactivation_dropoutadd_cross_attentionencoder_attnr   	LayerNormencoder_attn_layer_normself_attn_layer_normrq   ffn_dimfc1fc2final_layer_norm)r"   r   rk   r#   s      r$   r!   zXGLMDecoderLayer.__init__
  s!   &n,,
 
 
 ~#F$>?"(";% 	H -. 00#! ! !D ,.<+G+GD($&L$@$@!9T^V^<<9V^T^<< "T^ < <r%   rv   rw   rx   ry   FTr|   r~   encoder_hidden_statesencoder_attention_maskr   cross_attn_layer_head_maskr   	use_cacher   r   c           	      "   |}|                      |          }|                     ||||||
          \  }}t          j                            || j        | j                  }||z   }d}|g|}|                     |          }|                     |||||||
          \  }}t          j                            || j        | j                  }||z   }|}|                     |          }| 	                    | 
                    |                    }t          j                            || j        | j                  }|                     |          }t          j                            || j        | j                  }||z   }|f}|r|||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
                size `(decoder_attention_heads,)`.
            past_key_values (`Cache`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r|   rw   r~   r   r   r   r   N)r|   r}   r~   r   rw   r   r   )r   r   r   r   rh   r   r   r   r   r   r   r   r   )r"   r|   r~   r   r   r   r   rw   r   r   r   residualself_attn_weightscross_attn_weightsoutputss                  r$   r(   zXGLMDecoderLayer.forward(  s   B !11-@@ ,0>>'+)+/) ,: ,
 ,
(( --mt|VZVc-dd =0 " ,$H 88GGM040A0A+!65 : /"3- 1B 1 1-M- M11-4<Z^Zg1hhM$}4M !--m<<**488M+B+BCC--mt?Vaean-oo//--mt|VZVc-dd =0 " 	?)+=>>Gr%   r   )	NNNNNNFTN)r)   r*   r+   r   r!   r   r/   r0   r   r   r   r(   r1   r2   s   @r$   r   r   	  sZ       = =z = = = = = =< _%0A6RRR
 268<9=26=A+/,1$(15N N|N !.N  (5	N
 !) 6N "%,/N %-U\$:N "%N $D>N D>N !.N 
N N N SRN N N N Nr%   r   c                   .    e Zd ZU eed<   dZdZdgZd ZdS )XGLMPreTrainedModelr   modelTr   c                    | j         j        }t          |t          j                  rJ|j        j                            d|           |j         |j        j        	                                 d S d S t          |t          j
                  rS|j        j                            d|           |j        -|j        j        |j                 	                                 d S d S d S )Nre   )meanstd)r   init_stdr   r   rq   weightdatanormal_rj   zero_	Embeddingr   )r"   moduler   s      r$   _init_weightsz!XGLMPreTrainedModel._init_weights  s    k"fbi(( 	?M&&CS&999{& &&((((( '&-- 	?M&&CS&999!-"6#56<<>>>>>	? 	?--r%   N)	r)   r*   r+   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modulesr    r%   r$   r   r   {  sF         &*#+,	? 	? 	? 	? 	?r%   r   c            "           e Zd Zddedeej                 f fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddee	j
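
# Shape-level sketch of one pre-norm decoder block above (not executed on import);
# the small config values are arbitrary and the XGLMConfig keyword names are assumed
# to match configuration_xglm.py:
#
#   config = XGLMConfig(d_model=32, attention_heads=4, ffn_dim=64, num_layers=2)
#   layer = XGLMDecoderLayer(config, layer_idx=0)
#   (h_out,) = layer(torch.randn(1, 7, 32))   # residual blocks keep (batch, seq, d_model)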
                 dee	j
                 dee	j
                 dee	j
                 d	ee	j
                 d
ee	j
                 dee	j
                 dee         dee	j
                 dee         dee         dee         dee         dee	j
                 deee	j
                 ef         fd            Z xZS )	XGLMModelNr   embed_tokensc                 |   t                                                     j        | _        j        | _        j        | _        j        | _        j        rt          j
        j                  nd}||| _        n't          j        j        | j        |          | _        t          j        j        j                  | _        t#          j        fdt'          j                  D                       | _        t#          j        j                  | _        d| _        |                                  dS )zZ
        embed_tokens (`nn.Embedding`, *optional*):
            output embeddings
        r   N)r   c                 2    g | ]}t          |           S ))rk   )r   ).0ir   s     r$   
<listcomp>z&XGLMModel.__init__.<locals>.<listcomp>  s'    $m$m$mq%5f%J%J%J$m$m$mr%   F)r    r!   rh   	layerdroppad_token_idr   max_position_embeddingsmax_target_positionsscale_embeddingrI   sqrtr   r   r   
vocab_sizer4   embed_positionsr   
ModuleListrange
num_layersr   r   
layer_normgradient_checkpointing	post_init)r"   r   r   r   r#   s    `  r$   r!   zXGLMModel.__init__  s+   
 	   ~)!.$*$B!393IRdi///s# ,D 7!6>43CQ\! ! !D  A*N 
  

 m$m$m$m$mTYZ`ZkTlTl$m$m$mnn,v~66&+#r%   r&   r~   rW   r   r   	head_maskcross_attn_head_maskrw   inputs_embedsr   r   output_hidden_statesreturn_dictr   r   c                 f   ||n| j         j        }||n| j         j        }|
|
n| j         j        }
||n| j         j        }||	t          d          |G|                     ||           |                                }|                    d|d                   }n.|	|	                                dd         }nt          d          |	| 	                    |          }	| j
        r%| j        r|
rt                              d           d}
|
rO|M|6t          t          | j                   t          | j                             nt          | j                   }|
rCt!          |t"                    r.t                              d           t          j        |          }||                                nd	}t)          |||	|          }|Nt+          j        ||d         |z   t*          j        ||j        n|	j        
          }|                    d	          }||t5          ||	j        |d                   }|	|                     ||                              |	j                  z   }t<          j                             |tC          | j                   | j                  }|rdnd}|rdnd}|r|dnd}tE          ||gddg          D ]z\  }}|s|                                d	         tG          | j$                  k    rCt          d| dtG          | j$                   d|                                d	          d          {tK          | j$                  D ]\  }}|r||fz  }| j        r t+          j&        g           }|| j'        k     r4 ||||||||         nd|||         nd|||
|
  
        }|d	         }|r||d         fz  }|||d         fz  }| (                    |          }|r||fz  }|st#          d |||||fD                       S tS          |||||          S )a  
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
            the decoder.
        encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
            Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
            selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        cross_attn_head_mask (`torch.Tensor` of shape `(num_layers, attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        NzDYou cannot specify both input_ids and inputs_embeds at the same timerH   z5You have to specify either input_ids or inputs_embedsz_`use_cache = True` is incompatible with gradient checkpointing`. Setting `use_cache = False`...F)r   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   r<   )r   r   r   r   r   zThe `z` should be specified for z layers, but it is for .)r   r   r   rw   r   r   r   r   r7   c              3      K   | ]}||V  	d S r   r   )r   vs     r$   	<genexpr>z$XGLMModel.forward.<locals>.<genexpr>J  s0        =  === r%   )last_hidden_staterw   r|   
attentionscross_attentions)*r   r   r   r   use_return_dictro   %warn_if_padding_and_no_attention_maskrZ   rR   r   r   r   loggerwarning_oncer
   r	   r   r   from_legacy_cacheget_seq_lengthr   r/   rL   longr>   rN   r   r=   r   rB   r   r   rh   r.   ziplenr   	enumeraterandr   r   r   )r"   r&   r~   rW   r   r   r   r   rw   r   r   r   r   r   r   input_shaperX   r|   all_hidden_statesall_self_attnsall_cross_attentions	attn_mask	mask_nameidxdecoder_layerdropout_probabilitylayer_outputss                              r$   r(   zXGLMModel.forward  s~   H 2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	%0%<kk$+B]  ]%>cddd"66y.QQQ#..**K!r;r?;;II&',,..ss3KKTUUU  --i88M& 	"4= 	" "##u   "	  	0 )4 $L$D$D$DlZ^ZeFfFfFfggg!555 
  	UOU;; 	U\  
 2COTTOETE`!?!?!A!A!Afg:K8N
 
  <&B"88j+4+@y''mFZ	  L (11!44L !,1G1S%?&(;[QS_& & &" &(<(<\Ka(b(b(e(e )
 )
 
 --muT\?R?R]a]j-kk #7@BBD0:d&7h<Q<]rrdh %(4H(IKYoKp$q$q 	 	 Iy$>>##A&#dk*:*:::$3	 3 3SEUEU 3 3%NN,,Q/3 3 3   #,DK"8"8 	@ 	@C# 6!m%55!} &+jnn#&77)M%'=3<3H3dI]Ii,@,E,Eos /"3#-  M *!,M  @=#3"55(4(]1-=,??(66   	2-!11 	  ':K^]qr     
 9+++%1
 
 
 	
r%   r   )NNNNNNNNNNNNNN)r)   r*   r+   r   r   r   r   r!   r   r/   r0   r   r   r   r   r   r(   r1   r2   s   @r$   r   r     s        z ",9O      >  -115/38<9=,07;+/04$(,0/3&*15e
 e
EL)e
 !.e
 u|,	e

  (5e
 !) 6e
 EL)e
 'u|4e
 "%e
  -e
 D>e
 $D>e
 'tne
 d^e
 !.e
  
uU\"$MM	N!e
 e
 e
 ^e
 e
 e
 e
 e
r%   r   z

@auto_docstring(
    custom_intro="""
    The XGLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """
)
class XGLMForCausalLM(XGLMPreTrainedModel, GenerationMixin):
    base_model_prefix = "model"
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = XGLMModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
        r"""
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
            the decoder.
        encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
            Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
            selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        cross_attn_head_mask (`torch.Tensor` of shape `(num_layers, attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        logits = self.lm_head(outputs[0])

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits,
                labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

__all__ = ["XGLMForCausalLM", "XGLMModel", "XGLMPreTrainedModel"]