
     `i                        d Z ddlZddlmZmZ ddlZddlmc mZ	 ddlmZ ddl
mZ ddlmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZmZ ddlmZ  ej        e          Z G d dej                   Z! G d dej                   Z" G d dej                   Z# G d dej                   Z$ G d dej                   Z% G d dej                   Z& G d dej                   Z' G d dej                   Z( G d d ej                   Z) G d! d"ej                   Z* G d# d$ej                   Z+e G d% d&e                      Z,e G d' d(e,                      Z- ed)*           G d+ d,e,e                      Z.g d-Z/dS ).zPyTorch CPMAnt    N)OptionalUnion)nn)CrossEntropyLoss   )ACT2FN)CacheDynamicCache)GenerationMixin)BaseModelOutputWithPastCausalLMOutputWithPast)PreTrainedModel)auto_docstringlogging   )CpmAntConfigc                   >     e Zd ZdZdef fdZdej        fdZ xZ	S )CpmAntLayerNormz~
    We use Root Mean Square (RMS) Layer Normalization, please see https://huggingface.co/papers/1910.07467 for details."
    configc                     t                                                       |j        | _        |j        | _        t          j        t          j        |j                            | _	        d S N)
super__init__epshidden_sizedim_normr   	Parametertorchemptyweightselfr   	__class__s     ~/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/cpmant/modeling_cpmant.pyr   zCpmAntLayerNorm.__init__*   sN    :*l5;v/A#B#BCC    hidden_statesc                 p   |                     d          | j        k    rt          d          |j        }|                    t
          j                                      d                              dd          }|t          j	        || j
        z             z                      |          | j        z  }|S )f
        Args:
            hidden_states (`torch.Tensor` of shape `(batch, seq_len, dim_in)`)
        z'hidden_states.size(-1) != self.dim_norm   T)dimkeepdim)sizer   AssertionErrordtypetor   float32powmeanrsqrtr   r    )r"   r&   	old_dtypevariances       r$   forwardzCpmAntLayerNorm.forward1   s    
 b!!T]22 !JKKK!'	 ##EM2266q99>>2t>TT&X5H)I)IIMMiXX[_[ffr%   )
__name__
__module____qualname____doc__r   r   r   Tensorr7   __classcell__r#   s   @r$   r   r   %   sr         D| D D D D D D
U\ 
 
 
 
 
 
 
 
r%   r   c                        e Zd Zddef fdZ	 	 	 	 ddej        dej        dej        dej        d	ee	         d
ee
         dee	         deej                 fdZ xZS )CpmAntAttentionNr   c                    t                                                       |j        | _        |j        | _        |j        | _        || _        t          j	        | j        | j        | j        z  d          | _
        t          j	        | j        | j        | j        z  d          | _        t          j	        | j        | j        | j        z  d          | _        t          j	        | j        | j        z  | j        d          | _        t          j                            d          | _        |j        ,t          j                            |j                  | _        d S d | _        d S )NFbiasr)   r+   )p)r   r   r   	dim_modelnum_attention_heads	num_headsdim_head	layer_idxr   Linear	project_q	project_k	project_vattention_outr   Softmaxsoftmax	dropout_pDropoutdropoutr"   r   rJ   r#   s      r$   r   zCpmAntAttention.__init__?   s   +3"4>4>DM3QX]^^^4>4>DM3QX]^^^4>4>DM3QX]^^^Yt~'Et~\abbbx''B'//' 8++f.>+??DLLLDLLLr%   Fhidden_q	hidden_kvattention_maskposition_biasoutput_attentionspast_key_values	use_cachecache_positionc	           	         |                     d          }	|                     d          }
|                     d          }|                     |          }|                     |          }|                     |          }|                    |	|
| j        | j                                      dddd          }|                    |	|| j        | j                                      dddd          }|                    |	|| j        | j                                      dddd          }|7|                    ||| j	        d|i          \  }}|                     d          }t          j        ||                    dd                    t          j        | j                  z  }||z   }t          j        ||                    |	d|
|          t          j        d	          k    t          j        t%          d
          |j        |j                            }|                     |          }t          j        ||                    |	d|
|          t          j        d	          k    t          j        d|j        |j                            }|r|}nd}| j        |                     |          }t          j        ||          }|                    |	| j        |
| j                                      dddd          }|                                                    |	|
| j        | j        z            }|                     |          }||fS )ad  
        Args:
            hidden_q (`torch.Tensor`):
                Input of transformer block(self-attention block). It can be the raw embedding of a batch of sequences.
            hidden_kv (`torch.Tensor` of shape `(batch, len_k, dim_model)`)):
                Tensor *key_value* and *query* of shape `(batch, len_k, dim_model)`
            attention_mask (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
                Avoid invalid areas to participate in the calculation of self-attention.
            position_bias (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
                Provide positional information to self-attention block.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            past_key_values (`Cache`, *optional*):
                Cached past key and value projection states.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
        r   r   r*   r   Nr]   r)   Fz-inf)devicer/   )r-   rL   rM   rN   viewrH   rI   permuteupdaterJ   r   matmul	transposemathsqrtmasked_filltensorscalar_tensorfloatr`   r/   rQ   rT   
contiguousrO   )r"   rV   rW   rX   rY   rZ   r[   r\   r]   
batch_sizelen_qlen_kquerykeyvaluescoreattn_weightss                    r$   r7   zCpmAntAttention.forwardS   s   : ]]1%%
a  q!!x((nnY''y))

:udndmLLTTUVXY[\^_``hhz5$.$-HHPPQRTUWXZ[\\

:udndmLLTTUVXY[\^_``&(//UDNM]_mLnooJCHHRLLE UCMM"b$9$9::TYt}=U=UU%!
Aue<<U@S@SSfel%+VVV
 

 U##!
Aue<<U@S@SS%,ekJJJ
 

  	  LLL<#LL''E UE**

:t~udmLLTTUVXY[\^_``  ""''
E4>DM;YZZ""5))l""r%   r   )FNNN)r8   r9   r:   r   r   r   r<   
BoolTensorr   boolr	   r7   r=   r>   s   @r$   r@   r@   >   s           |            4 -2+/$(15M# M#,M# <M# (	M#
 |M# $D>M# "%M# D>M# !.M# M# M# M# M# M# M# M#r%   r@   c                        e Zd Zddef fdZ	 	 	 	 	 ddej        dej        deej                 dee         d	ee	         d
ee         deej                 fdZ
 xZS )CpmAntSelfAttentionBlockNr   c                    t                                                       t          |          | _        t	          ||          | _        |j        r+t          j        	                    |j                  | _
        d S d | _
        d S NrJ   )r   r   r   layernorm_before_attentionr@   self_attentionrR   r   r   rS   rT   rU   s      r$   r   z!CpmAntSelfAttentionBlock.__init__   sr    *9&*A*A'-f	JJJ 	  8++F,<==DLLLDLLLr%   Fr&   rX   rY   rZ   r[   r\   r]   c           
          |                      |          }|                     ||||||||          \  }}	| j        |                     |          }||z   }||	fS )a  
        Args:
            hidden_states (`torch.Tensor` of shape `(batch, len_seq, dim_model)`):
                Input of transformer block(self-attention block). It can be the raw embedding of a batch of sequences.
            attention_mask (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
                Avoid invalid areas to participate in the calculation of self-attention.
            position_bias (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
                Provide positional information to self-attention block.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            past_key_values (`Cache`, *optional*):
                Cached past key and value projection states.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
        )r|   r}   rT   )
r"   r&   rX   rY   rZ   r[   r\   r]   outputsrt   s
             r$   r7   z CpmAntSelfAttentionBlock.forward   sy    4 11-@@ $ 3 3	!
 	!
 <#ll7++G%/l**r%   r   NFNNNr8   r9   r:   r   r   r   r<   r   rv   r	   r7   r=   r>   s   @r$   rx   rx      s           |             15,1+/$(15*+ *+|*+ *+  -	*+
 $D>*+ "%*+ D>*+ !.*+ *+ *+ *+ *+ *+ *+ *+r%   rx   c                   :     e Zd Zdef fdZdej        fdZ xZS )CpmAntDenseGatedACTr   c                 &   t                                                       t          j        |j        |j        d          | _        t          j        |j        |j        d          | _        t          j        	                                | _
        d S NFrB   )r   r   r   rK   r   dim_ffw_0w_1r   GELUactr!   s     r$   r   zCpmAntDenseGatedACT.__init__   sh    9V/UKKK9V/UKKK8==??r%   r&   c                     |                      |                     |                    }|                     |          }||z  }|S )zTransform an input tensor from one feature space to another via a nonlinear operation

        Args:
            hidden_states (`torch.Tensor` of shape `(batch, seq_len, dim_in)`)
        )r   r   r   )r"   r&   
gate_scores      r$   r7   zCpmAntDenseGatedACT.forward   sB     XXdhh}5566
//"]2r%   	r8   r9   r:   r   r   r   r<   r7   r=   r>   s   @r$   r   r      sa        #| # # # # # #
U\ 
 
 
 
 
 
 
 
r%   r   c                   :     e Zd Zdef fdZdej        fdZ xZS )CpmAntFeedForwardr   c                 ,   t                                                       t          |          | _        |j        *t
          j                            |j                  | _        nd | _        t          j	        |j
        |j        d          | _        d S r   )r   r   r   w_inrR   r   r   rS   rT   rK   r   r   w_outr!   s     r$   r   zCpmAntFeedForward.__init__   su    '//	' 8++F,<==DLLDLYv}f.@uMMM


r%   r&   c                     |                      |          }| j        |                     |          }|                     |          }|S )r(   )r   rT   r   r"   r&   s     r$   r7   zCpmAntFeedForward.forward   sE    
 		-00<# LL77M

=11r%   r   r>   s   @r$   r   r      sh        N| N N N N N NU\        r%   r   c                   :     e Zd Zdef fdZdej        fdZ xZS )CpmAntFFNBlockr   c                 
   t                                                       t          |          | _        t	          |          | _        |j        r+t          j        	                    |j                  | _
        d S d | _
        d S r   )r   r   r   layernorm_before_ffnr   ffnrR   r   r   rS   rT   r!   s     r$   r   zCpmAntFFNBlock.__init__	  sl    $3F$;$;!$V,, 	  8++F,<==DLLLDLLLr%   r&   c                     |                      |          }|                     |          }| j        |                     |          }||z   }|S )z
        Args:
            hidden_states (`torch.Tensor` of shape `(batch, len_seq, dim_model)`):
                Hidden states before feed forward layer.
        )r   r   rT   )r"   r&   
ln_outputsr   s       r$   r7   zCpmAntFFNBlock.forward  sQ     ..}==
((:&&<#ll7++G%/r%   r   r>   s   @r$   r   r     sb         |            |       r%   r   c                        e Zd Zddef fdZ	 	 	 	 	 ddej        dej        deej                 dee         d	ee	         d
ee         deej                 fdZ
 xZS )CpmAntTransformerBlockNr   c                     t                                                       t          ||          | _        t	          |          | _        d S rz   )r   r   rx   self_attr   r   rU   s      r$   r   zCpmAntTransformerBlock.__init__$  s@    09MMM!&))r%   Fr&   rX   rY   rZ   r[   r\   r]   c           	      r    |                      |||||||          \  }}|                     |          }||fS )a  
        Args:
            hidden_states (`torch.Tensor`):
                Input to the layer of shape `(batch, seq_len, dim_model)`
            attention_mask (`torch.Tensor`):
                Avoid invalid areas to participate in the calculation of shape `(batch, seq_len, seq_len)`
            position_bias (`torch.Tensor`):
                Provides position information to attention mechanism of shape `(num_heads, seq_len, seq_len)`
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            past_key_values (`Cache`, *optional*):
                Cached past key and value projection states
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
        )rX   rY   rZ   r[   r\   r]   )r   r   )	r"   r&   rX   rY   rZ   r[   r\   r]   rt   s	            r$   r7   zCpmAntTransformerBlock.forward)  sT    4 '+mm)'/+) '4 '
 '
#| //l**r%   r   r   r   r>   s   @r$   r   r   #  s        * *| * * * * * * 15,1+/$(15%+ %+|%+ %+  -	%+
 $D>%+ "%%+ D>%+ !.%+ %+ %+ %+ %+ %+ %+ %+r%   r   c                        e Zd Zdef fdZ	 	 	 	 	 ddej        dej        dej        dee         dee         d	ee	         d
ee         deej                 fdZ
 xZS )CpmAntEncoderr   c                     t                                                       j        | _        t	          j        fdt          | j                  D                       | _        t                    | _	        d S )Nc                 2    g | ]}t          |           S )r{   )r   ).0ir   s     r$   
<listcomp>z*CpmAntEncoder.__init__.<locals>.<listcomp>U  s(    $q$q$qUV%;Fa%P%P%P$q$q$qr%   )
r   r   num_hidden_layers
num_layersr   
ModuleListrangelayersr   output_layernormr!   s    `r$   r   zCpmAntEncoder.__init__R  sm     2m$q$q$q$qZ_`d`oZpZp$q$q$qrr / 7 7r%   Nr&   rX   rY   rZ   output_hidden_statesr[   r\   r]   c	           	          |rdnd}	|rdnd}
t          | j                  D ]+\  }}|r|	|fz  }	 |||||||          }|\  }}|r|
|fz  }
,|                     |          }|r|	|fz  }	||	|
fS )a  
        Args:
            hidden_states (`torch.Tensor`):
                Input to the layer of shape `(batch, seq_len, dim_model)`
            attention_mask (`torch.Tensor`):
                Avoid invalid areas to participate in the calculation of shape `(batch, seq_len, seq_len)`
            position_bias (`torch.Tensor`):
                Provides position information to attention mechanism of shape `(num_heads, seq_len, seq_len)`
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers.
            past_key_values (`Cache`, *optional*):
                Cached past key and value projection states
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
         N)rZ   r[   r\   )	enumerater   r   )r"   r&   rX   rY   rZ   r   r[   r\   r]   all_hidden_statesall_self_attnsr   layerlayer_outputsrt   s                  r$   r7   zCpmAntEncoder.forwardY  s    : #7@BBD0:d!$+.. 	2 	2HAu# 6!m%55!!E"3 /#  M +8'M<  2</1--m<< 	2-!11/??r%   )NNNNNr   r>   s   @r$   r   r   Q  s        8| 8 8 8 8 8 8 -1/3+/$(154@ 4@|4@ 4@ |	4@
 $D>4@ 'tn4@ "%4@ D>4@ !.4@ 4@ 4@ 4@ 4@ 4@ 4@ 4@r%   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )CpmAntIntermediatec                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S r   )r   r   r   rK   r   intermediate_sizedense
isinstance
hidden_actstrr   intermediate_act_fnr!   s     r$   r   zCpmAntIntermediate.__init__  sn    Yv163KLL
f'-- 	9'-f.?'@D$$$'-'8D$$$r%   r&   returnc                 Z    |                      |          }|                     |          }|S r   )r   r   r   s     r$   r7   zCpmAntIntermediate.forward  s,    

=1100??r%   r8   r9   r:   r   r   r<   r7   r=   r>   s   @r$   r   r     s^        9 9 9 9 9U\ el        r%   r   c                   r     e Zd Zdef fdZdej        dej        dej        dej        fdZd ZddZ	 xZ
S )CpmAntSegmentPositionEmbeddingr   c                 4   t                                                       |j        | _        |j        | _        |j        | _        |j        | _	        t          j        t          j        |j        |j        z  |j        z   |j                            | _        d S r   )r   r   rG   rH   position_bias_num_bucketsnum_bucketsposition_bias_max_distancemax_distancesegment_typesnum_segmentsr   r   r   r   relative_attention_biasr!   s     r$   r   z'CpmAntSegmentPositionEmbedding.__init__  s    3!;"="0')|K$v';;f>^^* (
 (
$$$r%   key_pos	query_poskey_segmentquery_segmentc           	         t          j                    5  |                    d          }|                    d          }|                    d          }|                    d          |                    d          k    r<t          d|                    d           d|                    d           d          ||                    d          k    s||                    d          k    r)t          d| d|                    d           d          ||                    d          k    r)t          d| d|                    d           d          |                    |d|          }|                    ||d          }|                    |d|          }|                    ||d          }|                     ||          }|| j        z   }|                     t          j        |t           j	        |j
        	          d d d f         t          j        |t           j	        |j
        	          d d d f         z
  | j        | j        
          }	t          j        ||k    |	d d d d d f         |          }d d d            n# 1 swxY w Y   t          j        || j                  }
|
                    dddd                                          }
|
S )Nr   r   z>key_pos.size(0) should be equal to query_pos.size(0), but got z and !z7keylen should be equal to key_segment.size(1), but got z;querylen should be equal to query_segment.size(1), but got r)   r/   r`   )r   r   r   r*   )r   no_gradr-   r.   ra   !_segment_relative_position_bucketr   _position_bucketarangeint32r`   r   whereF	embeddingr   rb   rl   )r"   r   r   r   r   batchkeylenquerylenrelative_position_bucketabsolute_position_bucketembedss              r$   r7   z&CpmAntSegmentPositionEmbedding.forward  s8    ]__ %	 %	LLOOE\\!__F ~~a((H||A).."3"333$U\UaUabcUdUdktkykyz{k|k|   ))!,,,,M<N<Nq<Q<Q0Q0Q$qfqq[f[k[klm[n[nqqq   =--a0000$yRZyyanasastuavavyyy   ll5"f55G!uh;;I%**5"f==K)..uhCCM'+'M'Mm]h'i'i$'?$BR'R$ (,'<'<V5;?W?^___`dfgfgfg`gh,xu{C[Cbcccdededegkdklm ,!.	 (= ( ($ (-{-(qqq!!!4(( ($C%	 %	 %	 %	 %	 %	 %	 %	 %	 %	 %	 %	 %	 %	 %	P 5t7STT1a++6688s   I)J		JJc                     || j         z  |z   S r   )r   )r"   r   r   s      r$   r   z@CpmAntSegmentPositionEmbedding._segment_relative_position_bucket  s    t00;>>r%          c                 .   d}|dz  }|dk                         t          j                  |z  }t          j        |          }|dz  }||k     }|t          j        |                                |z            t          j        ||z            z  ||z
  z                       t          j                  z   }t          j        |t          j        ||dz
                      }|t          j	        ||                     t          j                  |          z  }|S )Nr   r*   r   )
r0   r   r   abslogrk   rf   min	full_liker   )r"   relative_positionr   r   relative_buckets	max_exactis_smallrelative_position_if_larges           r$   r   z/CpmAntSegmentPositionEmbedding._position_bucket  s   -155ekBB[P!I&7881$	$y0%.I'--//);<<h|i/001Y&( "U[//	&"
 &+Y&O6aHH&
 &
" 	EK2C2F2Fu{2S2SUopppr%   )r   r   )r8   r9   r:   r   r   r   r<   r7   r   r   r=   r>   s   @r$   r   r     s        
| 
 
 
 
 
 
22 <2 \	2
 |2 2 2 2h? ? ?               r%   r   c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )CpmAntOutputc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j	        |j
                  | _        d S )N)r   )r   r   r   rK   r   r   r   	LayerNormlayer_norm_epsrS   hidden_dropout_probrT   r!   s     r$   r   zCpmAntOutput.__init__  sf    Yv79KLL
f&8f>STTTz&"<==r%   r&   input_tensorr   c                     |                      |          }|                     |          }|                     ||z             }|S r   )r   rT   r   )r"   r&   r   s      r$   r7   zCpmAntOutput.forward  s@    

=11]33}|'CDDr%   r   r>   s   @r$   r   r     si        > > > > >U\  RWR^        r%   r   c                   $    e Zd ZU eed<   dZd ZdS )CpmAntPreTrainedModelr   cpmantc                 v   t          |t          j                  rT|j        j                            d| j        j                   |j         |j        j        	                                 dS dS t          |t          j
                  r_|j        j                            d| j        j                   |j        +|j        j        |j                 	                                 dS dS t          |t          j                  r?|j        j        	                                 |j        j                            d           dS t          |t                    r!|j        j                            d           dS t          |t                    r-|j        j                            d| j        j                   dS dS )zInitialize the weightsg        )r3   stdNg      ?)r   r   rK   r    datanormal_r   init_stdrC   zero_	Embeddingpadding_idxr   fill_r   r   r   )r"   modules     r$   _init_weightsz#CpmAntPreTrainedModel._init_weights  s   fbi(( 	\M&&CT[5I&JJJ{& &&((((( '&-- 
	\M&&CT[5I&JJJ!-"6#56<<>>>>> .--- 	\K""$$$M$$S)))))00 	\M$$S))))) >?? 	\*/77SdkFZ7[[[[[	\ 	\r%   N)r8   r9   r:   r   __annotations__base_model_prefixr  r   r%   r$   r   r     s<          \ \ \ \ \r%   r   c                       e Zd Zdef fdZd Zd Zd Ze	 	 	 	 	 	 	 dde	e
j                 de	e         d	e	e         d
e	e         de	e         de	e         de	e
j                 deee
j                 ef         fd            Z xZS )CpmAntModelr   c                    t                                          |           t          |          | _        t	          j        |j        |j                  | _        t	          j        |j	        |j
        |j        z  z   |j                  | _        t          |          | _        |j        | _        |j	        | _	        |                                  d S r   )r   r   r   encoderr   r   r   r   segment_embedding
vocab_sizeprompt_typesprompt_lengthinput_embeddingr   rY   	post_initr!   s     r$   r   zCpmAntModel.__init__%  s       $V,,!#f.BFDV!W!W!| 3f6J JJFL^ 
  
 <FCC#1 +r%   c                     | j         S r   r  r"   s    r$   get_input_embeddingsz CpmAntModel.get_input_embeddings2  s    ##r%   c                     || _         d S r   r  )r"   
embeddingskwargss      r$   set_input_embeddingsz CpmAntModel.set_input_embeddings5  s    )r%   c                    |                     d          }|                     d          }|j        }t          j        ||          t          j        ||                              dd          k    }|d d d d d f         |d d d d d f                                         |                    d||          z  z  }	|	|d d d d d f         |d d d d d f         k    z  }	t          j        t          t          || j	        z
                      d d d         |          d d d f         
                    |d          |d d d f         k     }
t          j        t          j        || j	        |                                          |
fd          }
|
                    ||d          |
                    |d|          z  |	z  }	|	S )Nr   r   )r`   r)   rD   )r-   r`   r   r   ra   logical_notri   listr   r  repeatcatonesrv   )r"   	input_idsspancontextlengthr   seqlenr`   directional_mask_2drX   mask_1ds              r$   _prepare_attention_maskz#CpmAntModel._prepare_attention_mask8  s   q!!""!#l6&AAAU\RXagEhEhEhEmEmnprsEtEtt D!!!,AAAqqq$J++--0C0H0HFTZ0[0[[
 (44
+;tAAAqqq$J?O+OP LeFT-?$?@@AA$$B$GPVWWWX\^_^_^_X_`gghmopqqQQQWo 	 )UZt/A&QQQVVXXZabhijjj eVQ77',,uaQW:X:XX[iir%   Nr  rZ   r   r[   r\   return_dictr]   r   c           
         ||n| j         j        }||n| j         j        }||n| j         j        }||n| j         j        }|j        t          j        k    r|                    t          j                  }|j        |j	        }
}	t          j
        |dk    dd                              |	|
          }|dk                        d                              |	|
          }t          j        t          j        | j        dz  | j        z   | j        dz  | j        z   |	|
                              |                    d          d          |fd          }|                                \  }}t          j        t          j        || j        |	|
          |fd          }t          j        ||fd|	|
          }t          j        ||	|
                              |d          }t          j        ||fd|	|
          }|r|t)          | j         	          }|rCt+          |t,                    r.t.                              d
           t)          j        |          }||                                nd}|                                }|                     |          }|                     |          }|dk    r|ddddddf         }||z   }|                     ||||          }|                     ||||          }|dd|dddf         }|dddd|dddf         }|dd|dddf         }|                      ||||||||          \  }}}|dk    rh|dd| j        dddf         }|+d}|D ]$}||dddd| j        d| j        df         fz  }%|}|#d}|D ]}||dd| j        dddf         fz  }|}|st-          d ||||fD                       S tC          ||||          S )ai  
        input_ids (`torch.Tensor` of shape `(batch_size, seq_len)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`CPMAntTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        Nr   r*   r   r)   r   r   rD   )r   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `DynamicCache` instead, e.g. `past_key_values=DynamicCache.from_legacy_cache(past_key_values)`.r   c              3      K   | ]}||V  	d S r   r   )r   vs     r$   	<genexpr>z&CpmAntModel.forward.<locals>.<genexpr>  s1        bcbobobobobo r%   )last_hidden_stater[   r&   
attentions)"r   rZ   r   use_return_dictr\   r/   r   r   r0   r`   r   sumr  r   r  r  r  r-   zerosfullr
   r   tupleloggerwarning_oncefrom_legacy_cacheget_seq_lengthrl   r  r
  r%  rY   r	  r   )r"   r  rZ   r   r[   r\   r&  r]   r  r/   r`   segmentr!  r   
seq_lengthr   positionr  past_lengthr&   segment_statesrX   rY   r   all_attentionsnew_attentions	attentionnew_hidden_stateshidden_states                                r$   r7   zCpmAntModel.forwardJ  s   * 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]!*!6IIDK<Q	 ?ek))!U[11I!)9v+i1na3366U66RRQ,##B''**v*FFI&*T_<&*T_<!	  
 &**A.. 
 
 
	 &NN,,z)U[0B%X^___ahiopqqq*eZ0!5PPP<
%GGGNNuVWXXz5*-qfMMM 	?0*$+>>>O 	NOU;; 	NU  
 +<_MMO:I:Uo44666[\((**	,,Y77//88!+AAArssAAAI6N%655iwPVWW**8XwPP';<<(:;%aaaKLL!!!&;<%aaaqqq&89;?<< 	<
 	<
8(. !)!!!T-?-A-A111*DEM)!#!/ e eI"yAAAt7I7K7KTM_MaMa1a'b&ddNN!/ ,$&!$5 U UL%,qqq$:L:N:NPQPQPQ7Q*R)TT%%$5! 	  )?<M~^      '+++%	
 
 
 	
r%   )NNNNNNN)r8   r9   r:   r   r   r  r  r%  r   r   r   r<   rv   r	   r   r1  r   r7   r=   r>   s   @r$   r  r  #  s@       |      $ $ $* * *  $  -1,0/3+/$(&*15p
 p
EL)p
 $D>p
 'tn	p

 "%p
 D>p
 d^p
 !.p
 
uU\"$;;	<p
 p
 p
 ^p
 p
 p
 p
 p
r%   r  zy
    The CPMAnt Model with a language modeling head on top (linear layer with weights tied to the input embeddings).
    )custom_introc                   4    e Zd ZdgZdef fdZe	 	 	 	 	 	 	 	 	 ddeej	                 dee
         dee         dee         d	ee         d
eej	                 dee         deej	                 deej	                 deeef         fd            Zd Zd Zd Z xZS )CpmAntForCausalLMzlm_head.weightr   c                    t                                          |           t          |          | _        t	          j        |j        |j        |j        |j	        z  z   d          | _
        |                                  d S r   )r   r   r  r   r   rK   r   r  r  r  lm_headr  r!   s     r$   r   zCpmAntForCausalLM.__init__  sz       !&)) y 1F4G&J^4^ ^ej
 
 
 	r%   Nr  r[   r\   rZ   r   labelsr&  rX   r]   r   c
           	         ||n| j         j        }|                     |||||||	          }|r|j        n|d         }|                     |          }d}|Tt                      } ||                    d|                    d                    |                    d                    }|s|f|dd         z   }||f|z   n|S t          |||j	        |j
        |j                  S )u<  
        input_ids (`torch.Tensor` of shape `(batch_size, seq_len)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`CPMAntTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss.

        Example:

        Text Generation with CpmAntForCausalLM.
        ```python
        >>> from transformers import CPMAntTokenizer, CpmAntForCausalLM

        >>> texts = "今天天气不错，"
        >>> model = CpmAntForCausalLM.from_pretrained("openbmb/cpm-ant-10b")
        >>> tokenizer = CPMAntTokenizer.from_pretrained("openbmb/cpm-ant-10b")
        >>> input_ids = tokenizer(texts, return_tensors="pt")
        >>> outputs = model.generate(**input_ids)
        >>> output_texts = tokenizer.batch_decode(outputs)
        >>> print(output_texts)
        ['今天天气不错，阳光明媚，我和妈妈一起去超市买东西。\n在超市里，我看到了一个很好玩的玩具，它的名字叫“机器人”。它有一个圆圆的脑袋，两只圆圆的眼睛，还有一个圆圆的']
        ```
        Nr   r)   r   )losslogitsr[   r&   r,  )r   r-  r   r+  rD  r   ra   r-   r   r[   r&   r,  )r"   r  r[   r\   rZ   r   rE  r&  rX   r]   r  model_outputr&   rH  rG  	loss_funcoutputs                    r$   r7   zCpmAntForCausalLM.forward  s   R &1%<kk$+B]{{ 
 
 ;FZ66<XY?m,,(**I9V[[V[[__==v{{2OOD 	FYabb!11F)-)9TGf$$vE%(8&4#.
 
 
 	
r%   c                     | j         j        S r   r   r  r  s    r$   r  z&CpmAntForCausalLM.get_input_embeddings  s    {**r%   c                     || j         _        d S r   rM  )r"   r  s     r$   r  z&CpmAntForCausalLM.set_input_embeddings  s    &0###r%   c                 l    d |D             }|D ]$}|d         |         |d<   |d         |         |d<   %|S )Nc                 4    g | ]}|t          |          n|S r   )r  )r   eachs     r$   r   z4CpmAntForCausalLM._reorder_cache.<locals>.<listcomp>   s'    ```)94:::t```r%   r   r   r   )r"   r[   beam_idxkey_value_layers       r$   _reorder_cachez CpmAntForCausalLM._reorder_cache  sV    ``P_```. 	> 	>O!0!3H!=OA!0!3H!=OAr%   )	NNNNNNNNN)r8   r9   r:   _tied_weights_keysr   r   r   r   r   r<   r	   rv   r   r1  r   r7   r  r  rT  r=   r>   s   @r$   rB  rB    sk        ++|        -1+/$(,0/3)-&*1515F
 F
EL)F
 "%F
 D>	F

 $D>F
 'tnF
 &F
 d^F
 !.F
 !.F
 
u,,	-F
 F
 F
 ^F
P+ + +1 1 1      r%   rB  )rB  r  r   )0r;   rf   typingr   r   r   torch.nn.functionalr   
functionalr   torch.nnr   activationsr   cache_utilsr	   r
   
generationr   modeling_outputsr   r   modeling_utilsr   utilsr   r   configuration_cpmantr   
get_loggerr8   r2  Moduler   r@   rx   r   r   r   r   r   r   r   r   r   r  rB  __all__r   r%   r$   <module>rd     s      " " " " " " " "                 % % % % % % ! ! ! ! ! ! . . . . . . . . ) ) ) ) ) ) O O O O O O O O - - - - - - , , , , , , , , . . . . . . 
	H	%	%    bi   2b# b# b# b# b#bi b# b# b#J4+ 4+ 4+ 4+ 4+ry 4+ 4+ 4+n    ")   (    	   4    RY   6++ ++ ++ ++ ++RY ++ ++ ++\<@ <@ <@ <@ <@BI <@ <@ <@@       Y  Y  Y  Y  Y RY Y  Y  Y z    29    \ \ \ \ \O \ \ \. W
 W
 W
 W
 W
' W
 W
 W
t   
a a a a a- a a 
aH H
G
Gr%   