
     `i~                        d Z ddlmZmZ ddlZddlZddlmZ ddlm	Z	m
Z
mZ ddlmZmZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZmZmZ ddlmZmZ ddlmZ  ej         e!          Z"d Z#d Z$d#dZ% G d dej&                  Z'd Z( G d dej&                  Z)e G d de                      Z*e G d de*                      Z+ ed           G d de*e                      Z, ed           G d  d!e*                      Z-g d"Z.dS )$zPyTorch CTRL model.    )OptionalUnionN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )CacheDynamicCache)GenerationMixin)BaseModelOutputWithPastCausalLMOutputWithPastSequenceClassifierOutput)PreTrainedModel)Conv1D find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging   )
CTRLConfigc                 N    dt          j        dd|dz  z  |z            z  }| |z  S )Nr   i'     )torchpow)posid_model_sizeangle_ratess       z/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/ctrl/modeling_ctrl.py
angle_defnr!   (   s0    eiQ!V'DEEEK    c                    t          t          j        | t          j                                      |                              d          t          j        |t          j                                      |                              d          |          }t          j        |d d dd df                   }t          j        |d d dd df                   }t          j        ||gd          }|S )Ndtyper   r   r   dim)	r!   r   arangeint64to	unsqueezesincoscat)positionr   r%   
angle_radssinescosinespos_encodings          r    positional_encodingr5   -   s    XU[11144U;;EEaHH\55588??II!LL J IjADqD)**Ei
111add7+,,G9eW-2666Lr"   c           	         t          j        | |                    dddd                    }|j        d         }|t	          j        |          z  }|A|                    d          |                    d          }
}	|||
|	z
  |
d |
f         dz  z  }|||z   }t          j        |d          }|||z  }t          j        ||          }||fS )	Nr   r   r	   r   r&   g     r'   )r   matmulpermuteshapenpsqrtsizesoftmax)qkvmaskattention_mask	head_mask	matmul_qkdkscaled_attention_logitsndnsattention_weightsoutputs                r    scaled_dot_product_attentionrL   <   s    Q		!Q1 5 566I	
B'"'"++5(--b113J3O3OPR3S3SB4R"crc(9#:T#AA!"9N"J&=2FFF -	9\+Q//F$$$r"   c                   @     e Zd Zd fd	Zd Zd Z	 	 	 	 	 	 ddZ xZS )	MultiHeadAttentionNc                    t                                                       || _        || _        || _        t          || j        z            | _        t          j        ||          | _	        t          j        ||          | _
        t          j        ||          | _        t          j        ||          | _        t                      | _        d S N)super__init__	num_headsr   	layer_idxintdepthr   LinearWqWkWvdensesetpruned_heads)selfr   rS   rT   	__class__s       r    rR   zMultiHeadAttention.__init__W   s    "("677
)L,77)L,77)L,77Y|\::
EEr"   c                    | j         | j        z  }t          |          dk    rd S t          || j        || j                  \  }}t          | j        |          | _        t          | j        |          | _        t          | j        |          | _        t          | j	        |d          | _	        | j        t          |          z
  | _        || j        z  | _         | j        
                    |          | _        d S )Nr   r   r'   )r   rS   lenr   r]   r   rX   rY   rZ   r[   union)r^   headsattention_head_sizeindexs       r    prune_headszMultiHeadAttention.prune_headsf   s    "/4>Au::??F7t~Obdhduvvu %TWe44$TWe44$TWe44'
EqAAA
 #e**4/$.@ -33E::r"   c                 t    |                     |d| j        | j                  }|                    g d          S )Nr&   r   r   r   r	   )reshaperS   rV   r9   )r^   x
batch_sizes      r    split_into_headsz#MultiHeadAttention.split_into_headsw   s3    IIj"dndjAAyy&&&r"   Fc                 B   |j         d         }|                     |          }|                     |          }|                     |          }|                     ||          }|                     ||          }|                     ||          }|"|                    ||| j        d|
i          \  }}t          ||||||          }|d                             g d          }|d         }|	                    |d| j
                  }|                     |          }||fS )Nr   cache_positionrh   r   r&   )r:   rX   rY   rZ   rl   updaterT   rL   r9   ri   r   r[   )r^   rA   r@   r?   rB   
layer_pastrC   rD   	use_cacheoutput_attentionsrn   rk   rK   scaled_attentionattnoriginal_size_attentions                   r    forwardzMultiHeadAttention.forward{   s    WQZ
GGAJJGGAJJGGAJJ!!!Z00!!!Z00!!!Z00!$$Q4><Ln;]^^DAq-aAt^YWW!!9,,\\\::ay"2":"::r4K\"]"]344t|r"   rP   NNNFFN)__name__
__module____qualname__rR   rf   rl   rv   __classcell__r_   s   @r    rN   rN   V   s        " " " " " "; ; ;"' ' '        r"   rN   c                     t          j        t          j        | |          t          j                    t          j        ||                     S rP   )r   
SequentialrW   ReLU)r   dffs     r    point_wise_feed_forward_networkr      s5    =<55rwyy")CQ]B^B^___r"   c                   4     e Zd Zd fd	Z	 	 	 	 	 	 ddZ xZS )EncoderLayer皙?Nc                 p   t                                                       t          |||          | _        t	          ||          | _        t          j        |d          | _        t          j        |d          | _	        t          j
        |          | _        t          j
        |          | _        d S )NrT   gư>eps)rQ   rR   rN   multi_head_attentionr   ffnr   	LayerNorm
layernorm1
layernorm2Dropoutdropout1dropout2)r^   r   rS   r   raterT   r_   s         r    rR   zEncoderLayer.__init__   s    $6|YZc$d$d$d!2<EE,|>>>,|>>>
4((
4((r"   Fc	                 V   |                      |          }	|                     |	|	|	|||||||
  
        }
|
d         }|                     |          }||z   }|                     |          }|                     |          }|                     |          }||z   }|f|
dd          z   }|S )Nrp   rC   rD   rq   rr   rn   r   r   )r   r   r   r   r   r   )r^   rj   rB   rp   rC   rD   rq   rr   rn   normedattn_outputsattn_outputout1out2
ffn_outputoutputss                   r    rv   zEncoderLayer.forward   s     ##00!)/) 1 
 
 #1ommK00;t$$XXd^^
]]:..
j 'L,,r"   )r   Nrw   )rx   ry   rz   rR   rv   r{   r|   s   @r    r   r      se        
) 
) 
) 
) 
) 
)  " " " " " " " "r"   r   c                   $    e Zd ZU eed<   dZd ZdS )CTRLPreTrainedModelconfigtransformerc                    t          |t          j        t          f          rT|j        j                            d| j        j                   |j	         |j	        j        
                                 dS dS t          |t          j                  r_|j        j                            d| j        j                   |j        +|j        j        |j                 
                                 dS dS t          |t          j                  r?|j	        j        
                                 |j        j                            d           dS dS )zInitialize the weights.g        )meanstdN      ?)
isinstancer   rW   r   weightdatanormal_r   initializer_rangebiaszero_	Embeddingpadding_idxr   fill_)r^   modules     r    _init_weightsz!CTRLPreTrainedModel._init_weights   s.   fry&122 	* M&&CT[5R&SSS{& &&((((( '&-- 	*M&&CT[5R&SSS!-"6#56<<>>>>> .--- 	*K""$$$M$$S)))))	* 	*r"   N)rx   ry   rz   r   __annotations__base_model_prefixr    r"   r    r   r      s7         %* * * * *r"   r   c                       e Zd Z fdZd Zd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 ddee	j
                 dee         dee	j                 d	ee	j
                 d
ee	j
                 dee	j                 dee	j                 dee         dee         dee         dee         dee	j                 deee	j                 ef         fd            Z xZS )	CTRLModelc                 4   t                                                     j        | _        j        | _        t          j        | j        t          j	                  | _
        t          j        j        j                  | _        t          j        j                  | _        t          j        fdt'          j                  D                       | _        t          j        j        j                  | _        |                                  d S )Nc           	      `    g | ]*}t          j        j        j        j        |           +S )r   )r   n_embdn_headr   resid_pdrop).0r   r   s     r    
<listcomp>z&CTRLModel.__init__.<locals>.<listcomp>   sF        V]FM6:vGYefggg  r"   r   )rQ   rR   r   r   n_layer
num_layersr5   n_positionsr   floatr4   r   r   
vocab_sizewr   
embd_pdropdropout
ModuleListrangehr   layer_norm_epsilon	layernorm	post_initr^   r   r_   s    `r    rR   zCTRLModel.__init__   s       "M ./0BDDUW\Wbccf/??z&"344   v~..  
 
 fm9RSSS 	r"   c                     | j         S rP   r   )r^   s    r    get_input_embeddingszCTRLModel.get_input_embeddings  s	    vr"   c                     || _         d S rP   r   )r^   new_embeddingss     r    set_input_embeddingszCTRLModel.set_input_embeddings  s    r"   c                     |                                 D ]*\  }}| j        |         j                            |           +dS )zv
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        N)itemsr   r   rf   )r^   heads_to_prunelayerrc   s       r    _prune_headszCTRLModel._prune_heads  sQ     +0022 	B 	BLE5F5M.::5AAAA	B 	Br"   N	input_idspast_key_valuesrC   token_type_idsposition_idsrD   inputs_embedsrq   rr   output_hidden_statesreturn_dictrn   returnc                 P   |	|	n| j         j        }	||n| j         j        }|
|
n| j         j        }
||n| j         j        }||t          d          |T|                     ||           |                                }|                    d|d                   }|j	        d         }n;|*|                                dd         }|j	        d         }nt          d          ||j
        n|j
        }|r|t          | j                   }|rCt          |t                    r.t                              d           t          j        |          }||                                nd}|@t%          j        ||d         |z   t$          j        |          }|                    d          }||dk    rt          d	          |                    |d          }|                    d
                              d          }|                    | j                  }d|z
  t%          j        | j                  j        z  }|                     || j         j                  }|N|                    d|d                   }|                     |          }|t;          j        | j                  z  }nd}||                     |          }|d         }t%          j         t%          j!        ||z   ||z             d
                              |          }|t;          j        | j                  z  }| j"                            |          | _"        | j"        |ddf         }||z   |z   }| #                    |          }|
rdnd}|	rdnd}tI          | j%                  D ]<\  }}|
r||fz   } |||||||         ||	|          }|d         }|	r||d
         fz  }=| &                    |          }|
r||fz   }|st          d ||||fD                       S tO          ||||          S )aE  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)

        Example:

        ```python
        >>> from transformers import AutoTokenizer, CTRLModel
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        >>> model = CTRLModel.from_pretrained("Salesforce/ctrl")

        >>> # CTRL was trained with control codes as the first token
        >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

        >>> outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 5, 1280]
        ```NzDYou cannot specify both input_ids and inputs_embeds at the same timer&   r   z5You have to specify either input_ids or inputs_embeds)r   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `DynamicCache` instead, e.g. `past_key_values=DynamicCache.from_legacy_cache(past_key_values)`.)r%   devicez$batch_size has to be defined and > 0r   r   r$   r   r   r   c              3      K   | ]}||V  	d S rP   r   )r   rA   s     r    	<genexpr>z$CTRLModel.forward.<locals>.<genexpr>  s1        bcbobobobobo r"   )last_hidden_stater   hidden_states
attentions)(r   rr   rq   r   use_return_dict
ValueError%warn_if_padding_and_no_attention_maskr=   viewr:   r   r   r   tupleloggerwarning_oncefrom_legacy_cacheget_seq_lengthr   r)   longr,   r+   r%   finfominget_head_maskr   r   r;   r<   r   triuonesr4   r   	enumerater   r   r   )r^   r   r   rC   r   r   rD   r   rq   rr   r   r   rn   kwargsinput_shaperk   r   past_lengthtoken_type_embedsseq_lenrB   
pos_embedsr   all_hidden_statesall_attentionsr   r   r   s                               r    rv   zCTRLModel.forward  s   b 2C1N--TXT_Tq!*!6IIDK<Q	$8$D  $+Jj 	 &1%<kk$+B] ]%>cddd"66y.QQQ#..**K!r;r?;;I"+JJ&',,..ss3K&,Q/JJTUUU%.%:!!@T 	?0*$+>>>O 	NOU;; 	NU  
 +<_MMO:I:Uo44666[\ <[_{5RZ_ZdmstttL'11!44L %Q !GHHH+00R@@N ,55a88BB1EEN ,..TZ.@@N!N2ek$*6M6M6QQN &&y$+2EFF	%+00[_EEN $~ 6 6):!;!;; !  FF9--Mb/z%*W{%:Gk<QRRTUVVYYZ`aa!2333 !-0088&|QQQ7
%
25FF]33"6@BBD0:ddf%% 	0 	0DAq# I$58H$H!a*-#A,#"3-	 	 	G $AJM  071:-/}55 	E 1]4D D 	  )?<M~^      '+++%	
 
 
 	
r"   NNNNNNNNNNNN)rx   ry   rz   rR   r   r   r   r   r   r   
LongTensorr
   FloatTensorboolTensorr   r   r   rv   r{   r|   s   @r    r   r      s           ,       B B B  15+/6:59371559$(,0/3&*15d
 d
E,-d
 "%d
 !!23	d

 !!12d
 u/0d
 E-.d
   12d
 D>d
 $D>d
 'tnd
 d^d
 !.d
 
uU\"$;;	<d
 d
 d
 ^d
 d
 d
 d
 d
r"   r   z
    The CTRL Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc                        e Zd ZdgZ fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 dee	         deej
                 deej                 deej                 d	eej
                 d
eej
                 deej                 dee         dee         dee         dee         deej                 deeej                 ef         fd            ZddZ xZS )CTRLLMHeadModelzlm_head.weightc                     t                                          |           t          |          | _        t	          j        |j        |j        d          | _        | 	                                 d S )NTr   )
rQ   rR   r   r   r   rW   r   r   lm_headr   r   s     r    rR   zCTRLLMHeadModel.__init__  s`       $V,,y0AMMM 	r"   Nr   r   rC   r   r   rD   r   labelsrq   rr   r   r   rn   r   c                 V   ||n| j         j        }|                     ||||||||	|
|||          }|d         }|                     |          }d}| | j        ||fd| j         j        i|}|s|f|dd         z   }||f|z   n|S t          |||j        |j        |j	                  S )a
  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CTRLLMHeadModel

        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        >>> model = CTRLLMHeadModel.from_pretrained("Salesforce/ctrl")

        >>> # CTRL was trained with control codes as the first token
        >>> inputs = tokenizer("Wikipedia The llama is", return_tensors="pt")
        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

        >>> sequence_ids = model.generate(inputs["input_ids"])
        >>> sequences = tokenizer.batch_decode(sequence_ids)
        >>> sequences
        ['Wikipedia The llama is a member of the family Bovidae. It is native to the Andes of Peru,']

        >>> outputs = model(**inputs, labels=inputs["input_ids"])
        >>> round(outputs.loss.item(), 2)
        9.21

        >>> list(outputs.logits.shape)
        [1, 5, 246534]
        ```N)r   rC   r   r   rD   r   rq   rr   r   r   rn   r   r   r   )losslogitsr   r   r   )
r   r   r   r  loss_functionr   r   r   r   r   )r^   r   r   rC   r   r   rD   r   r  rq   rr   r   r   rn   r   transformer_outputsr   	lm_logitsr	  rK   s                       r    rv   zCTRLLMHeadModel.forward  s   x &1%<kk$+B]"..+))%'/!5#) / 
 
 ,A.LL//	%4%   ;1 	 D  	F\$7$;;F)-)9TGf$$vE%/?-;*5
 
 
 	
r"   c                 :   |F|                                 }|j        d         |k    r|}n|j        d         dz
  }|d d |d f         }|||d}|                    dd            |                                D ]!\  }}	||vrt	          d| d           |	||<   "|S )Nr   )r   r   rq   r   z	Warning: z is not a recognized input.)r   r:   popr   print)
r^   r   r   rq   r   r   remove_prefix_lengthmodel_inputskeyvalues
             r    prepare_inputs_for_generationz-CTRLLMHeadModel.prepare_inputs_for_generation-  s     &)88::K q!K//'2$$ (1q'9A'=$!!!!%9%:%:":;I%.?ajkk 	

#T*** ,,.. 	* 	*JC,&&B#BBBCCC$)S!r"   )NNNNNNNNNNNNNNN)rx   ry   rz   _tied_weights_keysrR   r   r   r   r   r
   r   r   r   r   r   r   rv   r  r{   r|   s   @r    r  r    s        ++      15+/6:59371559-1$(,0/3&*15c
 c
E,-c
 "%c
 !!23	c

 !!12c
 u/0c
 E-.c
   12c
 )*c
 D>c
 $D>c
 'tnc
 d^c
 !.c
  
uU\"$::	;!c
 c
 c
 ^c
J       r"   r  a  
    The CTRL Model transformer with a sequence classification head on top (linear layer).
    [`CTRLForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do. Since it does classification on the last token, it requires to know the position of the last
    token. If a `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in
    each row. If no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot
    guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last
    value in each row of the batch).
    c                       e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 dee         deej	                 deej                 deej                 deej	                 d	eej	                 d
eej                 dee
         dee
         dee
         dee
         deeej                 ef         fd            Z xZS )CTRLForSequenceClassificationc                     t                                          |           |j        | _        t          |          | _        t          j        |j        | j        d          | _        | 	                                 d S )NFr  )
rQ   rR   
num_labelsr   r   r   rW   r   
classifierr   r   s     r    rR   z&CTRLForSequenceClassification.__init__V  si        +$V,,)FM4?OOO 	r"   Nr   r   rC   r   r   rD   r   r  rq   rr   r   r   r   c                    ||n| j         j        }|                     ||||||||	|
||          }|d         }|                     |          }||j        dd         \  }}n|j        dd         \  }}| j         j        |dk    rt          d          | j         j        d}n|}|| j         j        k                        |j        t          j
                  }t          j        |j        d         |j        t          j
                  }||z                      d          }n)d}t                              | j        j         d	           |t          j        ||j        
          |f         }d}|Z| j         j        f| j        dk    rd| j         _        nN| j        dk    r7|j        t          j        k    s|j        t          j        k    rd| j         _        nd| j         _        | j         j        dk    rWt-                      }| j        dk    r1 ||                                |                                          }n |||          }n| j         j        dk    rGt1                      } ||                    d| j                  |                    d                    }n*| j         j        dk    rt5                      } |||          }|s|f|dd         z   }||f|z   n|S t7          |||j        |j                  S )a2  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Example of single-label classification:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CTRLForSequenceClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl")

        >>> # CTRL was trained with control codes as the first token
        >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_class_id = logits.argmax().item()
        >>> model.config.id2label[predicted_class_id]
        'LABEL_0'
        ```

        ```python
        >>> import torch

        >>> torch.manual_seed(42)  # doctest: +IGNORE_RESULT
        >>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
        >>> num_labels = len(model.config.id2label)
        >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl", num_labels=num_labels)

        >>> labels = torch.tensor(1)
        >>> loss = model(**inputs, labels=labels).loss
        >>> round(loss.item(), 2)
        0.93
        ```

        Example of multi-label classification:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CTRLForSequenceClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        >>> model = CTRLForSequenceClassification.from_pretrained(
        ...     "Salesforce/ctrl", problem_type="multi_label_classification"
        ... )

        >>> # CTRL was trained with control codes as the first token
        >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_class_id = logits.argmax().item()
        >>> model.config.id2label[predicted_class_id]
        'LABEL_0'
        ```

        ```python
        >>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
        >>> num_labels = len(model.config.id2label)
        >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl", num_labels=num_labels)

        >>> num_labels = len(model.config.id2label)
        >>> labels = torch.nn.functional.one_hot(torch.tensor([predicted_class_id]), num_classes=num_labels).to(
        ...     torch.float
        ... )
        >>> loss = model(**inputs, labels=labels).loss
        >>> loss.backward()  # doctest: +IGNORE_RESULT
        ```N)
r   rC   r   r   rD   r   rq   rr   r   r   r   r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r&   )r   r%   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`)r   
regressionsingle_label_classificationmulti_label_classification)r	  r
  r   r   )r   r   r   r  r:   pad_token_idr   r+   r   r   int32r)   argmaxr   r   r_   rx   problem_typer  r%   r   rU   r   squeezer   r   r   r   r   r   )r^   r   r   rC   r   r   rD   r   r  rq   rr   r   r   r  r   r
  rk   sequence_lengthlast_non_pad_tokennon_pad_masktoken_indicespooled_logitsr	  loss_fctrK   s                            r    rv   z%CTRLForSequenceClassification.forward_  sL   P &1%<kk$+B]"..+))%'/!5# / 
 
 ,A.// *3/"1"*='J*7*=bqb*A'J;#+
a\]]];#+!#"%)AAEEfmUZU`aaL!L)<V]Z_ZefffM"/,">!F!Fr!J!J!#>* Z Z Z  
 u|Jv}MMMOaab{'/?a''/;DK,,_q((flej.H.HFL\a\eLeLe/LDK,,/KDK,{'<77"99?a''#8M$9$9$;$;V^^=M=MNNDD#8M6::DD)-JJJ+--x 2 22t G GUWYY)-III,..xv66 	F#%(;ABB(??F)-)9TGf$$vE' -;*5	
 
 
 	
r"   r   )rx   ry   rz   rR   r   r   r   r   r
   r   r   r   r   r   r   rv   r{   r|   s   @r    r  r  J  sw             15+/6:59371559-1$(,0/3&*p
 p
E,-p
 "%p
 !!23	p

 !!12p
 u/0p
 E-.p
   12p
 )*p
 D>p
 $D>p
 'tnp
 d^p
 
uU\"$<<	=p
 p
 p
 ^p
 p
 p
 p
 p
r"   r  )r  r  r   r   r  )/__doc__typingr   r   numpyr;   r   r   torch.nnr   r   r   cache_utilsr
   r   
generationr   modeling_outputsr   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   configuration_ctrlr   
get_loggerrx   r   r!   r5   rL   ModulerN   r   r   r   r   r  r  __all__r   r"   r    <module>r:     s      " " " " " " " "            A A A A A A A A A A . . . . . . . . ) ) ) ) ) ) i i i i i i i i i i - - - - - - Y Y Y Y Y Y Y Y Y Y        + * * * * * 
	H	%	%  
  % % % %4D D D D D D D DN` ` `/ / / / /29 / / /d * * * * */ * * ** I
 I
 I
 I
 I
# I
 I
 I
X   K K K K K)? K K K\ 
 
 
{
 {
 {
 {
 {
$7 {
 {

 
{
| c
b
br"   