
     `i$                     p   d Z ddlZddlZddlmZmZmZ ddlZddlmZ ddl	m
Z
 ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZmZmZ ddlm Z m!Z!m"Z" ddl#m$Z$  e!j%        e&          Z'd Z( G d dej)                  Z* G d dej)                  Z+ G d dej)                  Z, G d de          Z-e  G d de                      Z.e  G d de.                      Z/ e d           G d  d!e.e                      Z0 e d"           G d# d$e.                      Z1g d%Z2dS )&zPyTorch OpenAI ImageGPT model.    N)AnyOptionalUnion)nn)CrossEntropyLoss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentions SequenceClassifierOutputWithPast)PreTrainedModel)Conv1D find_pruneable_heads_and_indicesprune_conv1d_layer)auto_docstringloggingtorch_float   )ImageGPTConfigc                    	 ddl }ddl}n)# t          $ r t                              d            w xY wt
          j                            |          }t                              d|            |j	        
                    |          }g }g }|D ]|\  }	}
t                              d|	 d|
            |j	                            ||	          }|                    |	           |                    |                                           }t          ||          D ]=\  }	}|	dd         }	|	                    d          }	t!          d	 |	D                       s
|	d
         dv rAt                              d                    d                    |	                               | }|	d
         dvrt'          |d          }|	D ]}|                    d|          r|                    d|          }n|g}|d         dk    s|d         dk    rt'          |d          }nJ|d         dk    rt'          |d          }n,|d         dk    s|d         dk    r't'          ||d                   }t'          |d          }n|d         dv r!t'          |d          }t'          |d          }nt+          |	          dk    r?|	d         dk    r3|d         dk    r't'          ||d                   }t'          |d          }np|d         dk    r!t'          |d          }t'          |d          }nC|d         d k    r!t'          |d          }t'          |d          }nt'          ||d                   }t+          |          d!k    rt-          |d                   }||         }t+          |	          dk    r|	d         dk    s$|	d
         dk    s|	d
         d k    s|	d
         dk    rnC	 |j        |j        k    sJ n/# t0          $ r"}|xj        |j        |j        fz  c_         d}~ww xY wt                              d"|	            |	d
         d#k    rLt5          j        |                    |j        |j                            j        |j        ddd|j        f<   ||	d
         d$k    rTt5          j        |                    |j        |j                            j        |j        dd|j        d!|j        z  f<   |	d
         d%k    rOt5          j        |                    |j        |j                            j        |j        ddd!|j        z  df<   7t+          |	          dk    rQ|	d         dk    rE|	d!         dk    r9t5          j        |                    |j        |j                            |_        |	d
         dk    rt5          j        |          |_        |	d
         dk    r,t5          j        |          |j        d|j         dz
  ddf<   |	d
         d k    rt5          j        |          |j        d
<   $t5          j        |          |_        ?| S )&z0
    Load tf checkpoints in a pytorch model
    r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z&Converting TensorFlow checkpoint from zLoading TF weight z with shape    /c              3      K   | ]}|d v V  	dS ))adam_vadam_mAdamWeightDecayOptimizerAdamWeightDecayOptimizer_1global_stepN ).0ns     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/imagegpt/modeling_imagegpt.py	<genexpr>z.load_tf_weights_in_imagegpt.<locals>.<genexpr>P   s<       
 
 nn
 
 
 
 
 
    )_stepzSkipping {})wtettransformerz[A-Za-z]+\d+z(\d+)wgweightbbiaswpewte)q_projk_projv_projc_attnr   r   attnc_projr,   lm_headsos   zInitialize PyTorch weight r5   r6   r7   )!re
tensorflowImportErrorloggererrorospathabspathinfotrainlist_variablesload_variableappendsqueezezipsplitanyformatjoingetattr	fullmatchlenintshapeAssertionErrorargstorch
from_numpyreshapen_embdTdata
vocab_size)modelconfigimagegpt_checkpoint_pathr>   tftf_path	init_varsnamesarraysnamerU   arraypointerm_namescope_namesnumes                    r'   load_tf_weights_in_imagegptrn   /   s   				   Q	
 	
 	
 	 goo677G
KKBBBCCC''00IEF  ' 'eBBB5BBCCC&&w55Temmoo&&&&5&)) L3 L3eABBxzz#  
 

 
 
 
 
 	 "X""KK,,SXXd^^<<===88##g}55G 	' 	'F||OV44 ' hhx88%h1~$$A#(=(=!'844Q3&&!'622Q5((KNe,C,C!';q>::!'844Q#AAA!'844!'844TaDGv$5$5+a.H:T:T!';q>::!'844Q6))!'955!'844Q5((!'511!'844!';q>::;1$$+a.))!#,t99q==T!W..$r(f2D2DRTYHYHY]abd]ein]n]n}33333!   7=%+66 	7778888x/4/?fm]c]j@k@k/l/l/nGLOfmO+,,"X!!AFAQfmV];;B B LFMA,===>> "X!!383CEMMRXR_aganDoDo3p3p3rGLA-///00YY!^^Q6 1 1d1g6I6I +EMM&-,W,WXXGLL"X +E22GLL"X7<7G7N7NGL06,q00!!!344"X$/66GL +E22GLLLs     &1"O55
P!?PP!c                   Z     e Zd Zddee         def fdZdej        dej        fdZ	 xZ
S )	ImageGPTLayerNormh㈵>hidden_sizeepsc                     t                                                       || _        t          j        t          j        |                    | _        d S N)super__init__rs   r   	ParameterrX   Tensorr0   )selfrr   rs   	__class__s      r'   rw   zImageGPTLayerNorm.__init__   s?    l5<#<#<==r)   tensorreturnc                     |t          j        t          j        t          j        |          dd          | j        z             z  }|| j        z  }|S )Nr*   T)axiskeepdim)rX   sqrtmeansquarers   r0   )rz   r|   s     r'   forwardzImageGPTLayerNorm.forward   sL    %*UZV0D0D2W[%\%\%\_c_g%ghhh$+%r)   )rq   )__name__
__module____qualname__tuplerT   floatrw   rX   ry   r   __classcell__r{   s   @r'   rp   rp      sz        > >E#J >U > > > > > >
el u|        r)   rp   c                   B    e Zd Zddee         dee         f fdZd ZddZddZ	d	 Z
d
 Z	 	 	 	 	 	 	 	 ddej        dee         deej                 deej                 deej                 deej                 dee         dee         deej                 defdZ xZS )ImageGPTAttentionFNis_cross_attention	layer_idxc           	      `   t                                                       |j        }|                     dt	          j        t	          j        ||ft          j                                                dd||          d           |                     dt	          j	        d          d           |j
        | _        |j        | _        | j        | j        z  | _        | j        | _        | j        | j        z  | j        k    r t!          d| j         d	| j         d
          |j        | _        || _        |j        | _        || _        |j        | _        | j        rBt-          d| j        z  | j                  | _        t-          | j        | j                  | _        n"t-          d| j        z  | j                  | _        t-          | j        | j                  | _        t5          j        |j                  | _        t5          j        |j                  | _        tA                      | _!        d S )Nr2   dtyper   F)
persistentmasked_biasg     z=`embed_dim` must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r=   r   )"rv   rw   max_position_embeddingsregister_bufferrX   trilonesboolviewr|   rr   	embed_dimnum_attention_heads	num_headshead_dim
split_size
ValueErrorscale_attn_weightsr   scale_attn_by_inverse_layer_idxr   reorder_and_upcast_attnr   r8   q_attnr:   r   Dropout
attn_pdropattn_dropoutresid_pdropresid_dropoutsetpruned_heads)rz   r`   r   r   max_positionsr{   s        r'   rw   zImageGPTAttention.__init__   s   6Juz=-"@
SSSTTYY1m]   	 	
 	
 	
 	]EL,>,>5QQQ+3$.8.=4>)T^;;'PTP^ ' 'N' ' '  
 #)";"4 06/U,"'-'E$" 	E T^!3T^DDDK @@DKK T^!3T^DDDKT^T^<<Jv'899Z(:;;EEr)   c                 
   t          |          dk    rd S t          || j        | j        | j                  \  }}t          j        ||| j        z   |d| j        z  z   g          }t          | j	        |d          | _	        t          | j
        |d          | _
        | j        | j        z  | j        t          |          z
  z  | _        | j        t          |          z
  | _        | j                            |          | _        d S )Nr   r=   r   dim)rS   r   r   r   r   rX   catr   r   r8   r:   union)rz   headsindex
index_attns       r'   prune_headszImageGPTAttention.prune_heads   s    u::??F7t~t}^b^oppuYut'>T_I\@]^__
 )jaHHH(eCCC  ?dn<RUV[R\R\A\]#e**4 -33E::r)   c                     t          j        ||                    dd                    }| j        r(|t	          |                    d          dz            z  }| j        r|t          | j        dz             z  }| j	        s|                    d          |                    d          }}| j
        d d d d ||z
  |d |f         }	t          j        |j                  j        }
t          j        |
|j        |j                  }
t          j        |	||
          }|||z   } t#          j        d          |          }|                    |j                  }|                     |          }|||z  }t          j        ||          }||fS )Nr*         ?r   r   devicer   )rX   matmul	transposer   r   sizer   r   r   r   r2   finfor   minr|   r   wherer   Softmaxtyper   )rz   querykeyvalueattention_mask	head_maskattn_weightsquery_length
key_lengthcausal_mask
mask_valueattn_outputs               r'   _attnzImageGPTAttention._attn   s   |E3==R+@+@AA" 	M'+ejjnn6K*L*LLL / 	D'%0B*C*CCL& 	N',zz"~~sxx||*L)AAAqqq*|*Cj*PR]S]R]$]^K\%788<J j8JS_SfgggJ ;{L*MML%'.8L)rzb))),77 $((55((66  ')3Ll<77L((r)   c                 r   |                                 \  }}}}	|                                 \  }
}
}}
t          j        ||z  ||t          j        |j                  }d}| j        r(|t          |                     d                    dz  z  }| j        r|t          | j        dz             z  }t          j	        |j        j
        d          5  |                    d||	          |                    dd                              d|	|          }}t          j        ||                                |                                d	|
          }|                    ||||          }d d d            n# 1 swxY w Y   | j        s|                     d          |                     d          }}| j        d d d d ||z
  |d |f         }t          j        |j                  j        }t          j        ||j        |j                  }t          j        |||          }|||z   } t+          j        d          |          }|j        t          j        k    rt/          d          |
                    |j                  }|                     |          }|||z  }t          j        ||          }||fS )Nr         ?r*   r   r   F)enabledr   r   )betaalphar   zDError with upcasting, attn_weights does not have dtype torch.float32)r   rX   emptyfloat32r   r   r   r   r   autocastr   rZ   r   baddbmmr   r2   r   r   r   r|   r   r   r   RuntimeErrorr   r   )rz   r   r   r   r   r   bszr   	q_seq_lendk_	k_seq_lenr   scale_factorqkr   r   r   r   r   s                        r'   _upcast_and_reordered_attnz,ImageGPTAttention._upcast_and_reordered_attn	  s   (-

%Y	2 XXZZ1i {3?IyPUP]fkfrsss " 	9E%**R..11S88L/ 	6E$.1"4555L ^EL-u=== 	V 	V==Y33S]]2r5J5J5R5RSUWY[d5e5eqA =qwwyy!''))RS[ghhhL'//Y	9UUL	V 	V 	V 	V 	V 	V 	V 	V 	V 	V 	V 	V 	V 	V 	V
 & 	N',zz"~~sxx||*L)AAAqqq*|*Cj*PR]S]R]$]^K\%788<J j8JS_SfgggJ ;{L*MML%'.8L)rzb))),77 ..efff#((55((66  ')3Ll<77L((s   BE44E8;E8c                     |                                 dd         ||fz   } |j        | }|                    dddd          S )zJ
        Splits hidden_size dim into attn_head_size and num_heads
        Nr*   r   r=   r   r   )r   r   permuterz   r|   r   attn_head_size	new_shapes        r'   _split_headszImageGPTAttention._split_heads=  sJ     KKMM#2#&)^)DD	i(~~aAq)))r)   c                     |                     dddd                                          }|                                dd         ||z  fz   }|                    |          S )zS
        Merges attn_head_size dim and num_attn_heads dim into hidden_size
        r   r=   r   r   Nr   )r   
contiguousr   r   r   s        r'   _merge_headszImageGPTAttention._merge_headsE  s\     1a++6688KKMM#2#&)n*D)FF	{{9%%%r)   hidden_states
layer_pastr   r   encoder_hidden_statesencoder_attention_mask	use_cacheoutput_attentionscache_positionr}   c
                 <   |d u}
|j         \  }}}|Ht          |t                    r1|j                            | j                  }|
r|j        }n
|j        }n|}|
r|n|}|
rt          | d          st          d          |G|rE| 
                    |          }|j        | j                 j        }|j        | j                 j        }nS| 
                    |          }|                     |                              | j        d          \  }}|                    |d| j        | j                                      dd          }|                    |d| j        | j                                      dd          }n|                     |                              | j        d          \  }}}|                    |d| j        | j                                      dd          }|                    |d| j        | j                                      dd          }|9|
s|	nd }	|                    ||| j        d|	i          \  }}|
rd|j        | j        <   |                    ||| j        | j                                      dd          }| j        r|                     |||||          \  }}n|                     |||||          \  }}|                     || j        | j                  }|                     |          }|                     |          }||fS )	Nr   zIf class is used as cross attention, the weights `q_attn` have to be defined. Please make sure to instantiate class with `ImageGPTAttention(..., is_cross_attention=True)`.r=   r   r*   r   r   T)rU   
isinstancer   
is_updatedgetr   cross_attention_cacheself_attention_cachehasattrr   r   layerskeysvaluesr8   rM   r   r   r   r   r   updater   r   r   r   r:   r   )rz   r   r   r   r   r   r   r   r   r   r   r   seq_lenr   r   curr_past_key_valuecurrent_statesr   r   r   r   r   s                         r'   r   zImageGPTAttention.forwardM  s3    3$>'-Wa!*&9:: 1'266t~FF
% J*4*J''*4*I''&0#2DW..- 	W4**  t  
 %*%M22)0@E+24>BIM22![[88>>tTU>VV
UhhsBFFPPQRTUVV

3DNDMJJTTUVXYZZ $N ; ; A A$/WX A Y YE3((3DNDMBBLLQPQRRCJJsBFFPPQRTUVVE!3EO^^4N,33CQacqPrssJC! =8<
%dn5

3GGQQRSUVWW' 	a(,(G(GsTY[ikt(u(u%K(,

5#unV_(`(`%K''T^T]SSkk+..((55L((r)   )FN)NNNNNNNFFN)r   r   r   r   r   rT   rw   r   r   r   r   r   rX   ry   r
   r   r   r   r   s   @r'   r   r      s       )" )"8D> )"V^_bVc )" )" )" )" )" )"V; ; ;$) $) $) $)L2) 2) 2) 2)h* * *& & & '+15,08<9=$),115D) D)|D) UOD) !.	D)
 EL)D)  (5D) !) 6D) D>D) $D>D) !.D) 
D) D) D) D) D) D) D) D)r)   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )ImageGPTMLPc                    t                                                       |j        }t          ||          | _        t          ||          | _        t          |j                 | _        t          j
        |j                  | _        d S ru   )rv   rw   rr   r   c_fcr:   r	   activation_functionactr   r   r   dropout)rz   intermediate_sizer`   r   r{   s       r'   rw   zImageGPTMLP.__init__  sl    &	,i88	Y(9::&45z&"455r)   r   r}   c                     |                      |          }|                     |          }|                     |          }|                     |          }|S ru   )r  r  r:   r  )rz   r   s     r'   r   zImageGPTMLP.forward  sL    		-00//M22]33r)   )r   r   r   rw   rX   ry   r   r   r   s   @r'   r  r    s^        6 6 6 6 6U\ el        r)   r  c                        e Zd Zd fd	Z	 	 	 	 	 	 	 	 ddej        dee         deej                 deej                 deej                 d	eej                 d
ee         dee         deej                 de	fdZ
 xZS )ImageGPTBlockNc                    t                                                       |j        }|j        |j        nd|z  }t	          ||j                  | _        t          ||          | _        t	          ||j                  | _	        |j
        r2t          |d|          | _        t	          ||j                  | _        t          ||          | _        d S )N   rs   r   T)r   r   )rv   rw   rr   n_innerrp   layer_norm_epsilonln_1r   r9   ln_2add_cross_attentioncrossattentionln_cross_attnr  mlp)rz   r`   r   rr   	inner_dimr{   s        r'   rw   zImageGPTBlock.__init__  s    (&,n&@FNNa+o	%kv7PQQQ	%f	BBB	%kv7PQQQ	% 	_"3Ft_h"i"i"iD!2;FD]!^!^!^Dy&11r)   Fr   r   r   r   r   r   r   r   r   r}   c
           
         |}
|                      |          }|                     |||||||	          }|d         }|dd          }||
z   }|qt          | d          st          d|  d          |}
|                     |          }|                     ||||||||	          }|d         }|
|z   }||dd          z   }|}
|                     |          }|                     |          }|
|z   }|f|z   S )N)r   r   r   r   r   r   r   r   r  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)r   r   r   r   r   r   r   )r  r9   r   r   r  r  r  r  )rz   r   r   r   r   r   r   r   r   r   residualattn_outputsr   outputscross_attn_outputsfeed_forward_hidden_statess                   r'   r   zImageGPTBlock.forward  sg    !		-00yy!)/) ! 
 
 #1oqrr"#h. ,4!122  Zd Z Z Z   %H ..}==M!%!4!4%-#&;'="3- "5 	" 	" -Q/K${2M 2122 66G 		-00%)XXm%<%<" #=='))r)   ru   r   )r   r   r   rw   rX   ry   r   r
   r   r   r   r   r   s   @r'   r  r    s       2 2 2 2 2 2$ '+15,08<9=$),115:* :*|:* UO:* !.	:*
 EL):*  (5:* !) 6:* D>:* $D>:* !.:* 
:* :* :* :* :* :* :* :*r)   r  c                   F     e Zd ZU eed<   eZdZdZdZ	dgZ
 fdZd Z xZS )ImageGPTPreTrainedModelr`   r-   	input_idsTr  c                 :     t                      j        |i | d S ru   )rv   rw   )rz   inputskwargsr{   s      r'   rw   z ImageGPTPreTrainedModel.__init__  s%    &+F+++++r)   c           	         t          |t          j        t          f          rQ|j        j                            d| j        j                   |j	        |j	        j        
                                 nt          |t          j                  r\|j        j                            d| j        j                   |j        )|j        j        |j                 
                                 n4t          |t                    r|j        j                            d           |                                D ]U\  }}d|v rLd|v rH|j                            d| j        j        t!          j        d| j        j        z            z             VdS )zInitialize the weights.g        )r   stdNr   r:   r0   r=   )r   r   Linearr   r0   r]   normal_r`   initializer_ranger2   zero_	Embeddingpadding_idxrp   fill_named_parametersmathr   n_layer)rz   modulerg   ps       r'   _init_weightsz%ImageGPTPreTrainedModel._init_weights  sj   fry&122 	* M&&CT[5R&SSS{& &&(((-- 	*M&&CT[5R&SSS!-"6#56<<>>> 122 	*M$$S))) ..00 	s 	sGD!4H$4$4Cdk.KdiXY\`\g\oXoNpNp.prrr	s 	sr)   )r   r   r   r   __annotations__rn   load_tf_weightsbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesrw   r3  r   r   s   @r'   r   r     s{         1O%!O&*#(), , , , ,s s s s s s sr)   r   c            $           e Zd Zdef fdZd Zd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 dde	e
j                 de	e         d	e	e
j                 d
e	e
j                 de	e
j                 de	e
j                 de	e
j                 de	e
j                 de	e
j                 de	e         de	e         de	e         de	e         de	e
j                 dedeeef         f d            Z xZS )ImageGPTModelr`   c                 0   t                                                     j        | _        t	          j        j        | j                  | _        t	          j        j        | j                  | _	        t	          j
        j                  | _        t	          j        fdt          j                  D                       | _        t#          | j        j                  | _        d| _        d | _        d| _        |                                  d S )Nc                 2    g | ]}t          |           S )r  )r  )r%   ir`   s     r'   
<listcomp>z*ImageGPTModel.__init__.<locals>.<listcomp>$  s&    lllqf B B Blllr)   r  F)rv   rw   rr   r   r   r+  r^   r4   r   r3   r   
embd_pdropdrop
ModuleListrangenum_hidden_layershrp   r  ln_fmodel_parallel
device_mapgradient_checkpointing	post_initrz   r`   r{   s    `r'   rw   zImageGPTModel.__init__  s       +< 14>BB< >OOJv011	llllERXRjLkLklllmm%dn&:STTT	 $&+#r)   c                     | j         S ru   r4   )rz   s    r'   get_input_embeddingsz"ImageGPTModel.get_input_embeddings.  s	    xr)   c                     || _         d S ru   rM  )rz   new_embeddingss     r'   set_input_embeddingsz"ImageGPTModel.set_input_embeddings1  s    !r)   c                     |                                 D ]*\  }}| j        |         j                            |           +dS )zv
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        N)itemsrE  r9   r   )rz   heads_to_prunelayerr   s       r'   _prune_headszImageGPTModel._prune_heads4  sM     +0022 	2 	2LE5F5M**51111	2 	2r)   Nr!  past_key_valuesr   token_type_idsposition_idsr   inputs_embedsr   r   r   r   output_hidden_statesreturn_dictr   r$  r}   c                 
   ||n| j         j        }||n| j         j        }|
|
n| j         j        }
||n| j         j        }||t          d          |T|                     ||           |                                }|                    d|d                   }|j	        d         }n;|*|                                dd         }|j	        d         }nt          d          ||j
        n|j
        }| j        r%| j        r|
rt                              d           d}
|
r8|6t          t!          | j                   t!          | j                             }|
rCt#          |t$                    r.t                              d	           t          j        |          }||                                n|}||                    d|d                   }|@t+          j        ||d         |z   t*          j        |
          }|                    d          }|z|dk    rt          d          |                    |d          }|ddddddf         }|                    | j                  }d|z
  t+          j        | j                  j        z  }| j         j        rL|J|                                \  }}}||f}|	t+          j        ||          }	|                     |	          }	nd}	|                      || j         j!                  }|| "                    |          }| #                    |          }||                    |j
                  z   }|| "                    |          }||z   }| $                    |          }||                    d          fz   }|rdnd}|r| j         j        rdnd}|rdnd}tK          | j&                  D ]F\  }} | j'        rtt*          j(        )                    |j
                   ||                    |j
                  }t#          |t*          j*                  r|                    |j
                  }|r||fz   } | |||||         ||	|
||	  	        }!|!d         }|r$||!d         fz   }| j         j        r||!d         fz   }| j'        rn| j+        ,                                D ]T\  }"}#||#d         k    rCdt[          |"          z   | j.        k    r(|                    dt[          |"dz             z             }UH| /                    |          } |j        | }|r||fz   }|st%          d |||||fD                       S ta          |||||          S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, ImageGPTModel
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
        >>> model = ImageGPTModel.from_pretrained("openai/imagegpt-small")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> last_hidden_states = outputs.last_hidden_state
        ```NzDYou cannot specify both input_ids and inputs_embeds at the same timer*   r   z5You have to specify either input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r`   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   z$batch_size has to be defined and > 0r   r   )r   r$   )r   r   r   r   r   r=   zcuda:c              3      K   | ]}||V  	d S ru   r$   )r%   vs     r'   r(   z(ImageGPTModel.forward.<locals>.<genexpr>  s0        =  === r)   )last_hidden_staterW  r   
attentionscross_attentions)1r`   r   r[  r   use_return_dictr   %warn_if_padding_and_no_attention_maskr   r   rU   r   rI  trainingrA   warning_oncer   r   r   r   from_legacy_cacheget_seq_lengthrX   arangelong	unsqueezetor   r   r   r  r   invert_attention_maskget_head_maskr0  r4   r3   rA  	enumeraterE  rG  cuda
set_devicery   rH  rS  strlast_devicerF  r   )$rz   r!  rW  r   rX  rY  r   rZ  r   r   r   r   r[  r\  r   r$  input_shape
batch_sizer   past_lengthencoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeposition_embedsr   token_type_embedsoutput_shapeall_self_attentionsall_cross_attentionsall_hidden_statesr>  blockr  r   r_  s$                                       r'   r   zImageGPTModel.forward;  sv   ` 2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	%0%<kk$+B] ]%>cddd"66y.QQQ#..**K!r;r?;;I"+JJ&',,..ss3K&,Q/JJTUUU%.%:!!@T& 	"4= 	" "##p   "	 	v01,dk2R2R2RT`hlhsTtTtTtuuO 	UOU;; 	U\  
 2COTTO:I:Uo44666[j%+00[_EEN <[_{5RZ_ZdmstttL'11!44L %Q !GHHH+00R@@N ,AAAtT111,<=N ,..TZ.@@N!N2ek$*6M6M6QQN ;* 	*/D/P=R=W=W=Y=Y: 7$68O#P %-).4HQW)X)X)X&%)%?%?@V%W%W""%)" &&y$+2EFF	  HHY//M((<00%(:(:=;O(P(PP% $ 8 8),==M		-00"m&8&8&<&<%>>$5?bb4%6d4;;Zdrr`d"6@BBD!$&)) "	O "	OHAu" C
%%m&:;;;!-%3%6%6}7K%L%LNi66 C )]-A B BI# I$58H$H!e!%'=#"3-
 
 
G $AJM  P&9WQZM&I#;2 P+?71:-+O( " O O1133 O ODAqAbEzzgA&6$:J&J&J(5(8(83q1u::9M(N(N		-00**L9   	E 1]4D D 	  ':KM`bvw      9+++*1
 
 
 	
r)   )NNNNNNNNNNNNNN)r   r   r   r   rw   rN  rQ  rV  r   r   rX   ry   r
   r   r   r   r   r   r   r   r   s   @r'   r;  r;    s       ~      &  " " "2 2 2  -1+/1515/3,0048<9=$(,0/3&*15K
 K
EL)K
 "%K
 !.	K

 !.K
 u|,K
 EL)K
  -K
  (5K
 !) 6K
 D>K
 $D>K
 'tnK
 d^K
 !.K
  !K
" 
u??	@#K
 K
 K
 ^K
 K
 K
 K
 K
r)   r;  z
    The ImageGPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc            &           e Zd ZdgZdef fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej	                 dee
         deej	                 deej	                 d	eej	                 d
eej	                 deej	                 deej	                 deej	                 deej	                 dee         dee         dee         dee         deej	                 dedeeef         f"d            Z xZS )ImageGPTForCausalImageModelingzlm_head.weightr`   c                    t                                          |           t          |          | _        t	          j        |j        |j        dz
  d          | _        d| _	        d | _
        |                                  d S )Nr   Fr2   )rv   rw   r;  r-   r   r'  r[   r^   r;   rG  rH  rJ  rK  s     r'   rw   z'ImageGPTForCausalImageModeling.__init__  st       (00y0AA0EERRR $r)   Nr!  rW  r   rX  rY  r   rZ  r   r   labelsr   r   r[  r\  r   r$  r}   c                 V   ||n| j         j        }|                     |||||||||	|||||          }|d         }|                     |          }d}|
|dddddf                                         }|
dddf                                         }t                      } ||                    d|                    d                    |                    d                    }|s|f|dd         z   }||f|z   n|S t          |||j	        |j
        |j        |j                  S )a&
  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, ImageGPTForCausalImageModeling
        >>> import torch
        >>> import matplotlib.pyplot as plt
        >>> import numpy as np

        >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
        >>> model = ImageGPTForCausalImageModeling.from_pretrained("openai/imagegpt-small")
        >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        >>> model.to(device)  # doctest: +IGNORE_RESULT

        >>> # unconditional generation of 8 images
        >>> batch_size = 4
        >>> context = torch.full((batch_size, 1), model.config.vocab_size - 1)  # initialize with SOS token
        >>> context = context.to(device)
        >>> output = model.generate(
        ...     input_ids=context, max_length=model.config.n_positions + 1, temperature=1.0, do_sample=True, top_k=40
        ... )

        >>> clusters = image_processor.clusters
        >>> height = image_processor.size["height"]
        >>> width = image_processor.size["width"]

        >>> samples = output[:, 1:].detach().cpu().numpy()
        >>> samples_img = [
        ...     np.reshape(np.rint(127.5 * (clusters[s] + 1.0)), [height, width, 3]).astype(np.uint8) for s in samples
        ... ]  # convert color cluster tokens back to pixels
        >>> f, axes = plt.subplots(1, batch_size, dpi=300)

        >>> for img, ax in zip(samples_img, axes):  # doctest: +IGNORE_RESULT
        ...     ax.axis("off")
        ...     ax.imshow(img)
        ```N)rW  r   rX  rY  r   rZ  r   r   r   r   r[  r\  r   r   .r*   r   )losslogitsrW  r   ra  rb  )r`   rc  r-   r;   r   r   r   r   r   rW  r   ra  rb  )rz   r!  rW  r   rX  rY  r   rZ  r   r   r  r   r   r[  r\  r   r$  transformer_outputsr   	lm_logitsr  shift_logitsshift_labelsloss_fctoutputs                            r'   r   z&ImageGPTForCausalImageModeling.forward  sz   N &1%<kk$+B]"..+))%'"7#9/!5#) / 
 
  ,A.LL//	$S#2#qqq[1<<>>L!#qrr'?5577L'))H8L--b,2C2CB2G2GHH,J[J[\^J_J_``D 	F\$7$;;F)-)9TGf$$vE0/?-;*50A
 
 
 	
r)   )NNNNNNNNNNNNNNN)r   r   r   _tied_weights_keysr   rw   r   r   rX   ry   r
   r   r   r   r   r   r   r   r   s   @r'   r  r  
  s        ++	~ 	 	 	 	 	 	  -1+/1515/3,0048<9=)-$(,0/3&*15!p
 p
EL)p
 "%p
 !.	p

 !.p
 u|,p
 EL)p
  -p
  (5p
 !) 6p
 &p
 D>p
 $D>p
 'tnp
 d^p
  !.!p
" #p
$ 
u77	8%p
 p
 p
 ^p
 p
 p
 p
 p
r)   r  z
    The ImageGPT Model transformer with an image classification head on top (linear layer).
    [`ImageGPTForImageClassification`] average-pools the hidden states in order to do the classification.
    c                    t    e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 dee	         deej                 deej                 deej                 d	eej                 d
eej                 deej                 dee
         dee
         dee
         dee
         dedeeef         fd            Z xZS )ImageGPTForImageClassificationr`   c                     t                                          |           |j        | _        t          |          | _        t          j        |j        | j        d          | _        | 	                                 d S )NFr  )
rv   rw   
num_labelsr;  r-   r   r'  r[   scorerJ  rK  s     r'   rw   z'ImageGPTForImageClassification.__init__  si        +(00Yv}doEJJJ
 	r)   Nr!  rW  r   rX  rY  r   rZ  r  r   r   r[  r\  r$  r}   c                    ||n| j         j        }|                     ||||||||	|
||          }|d         }|                    d          }|                     |          }d}||                     ||| j                   }|s|f|dd         z   }||f|z   n|S t          |||j        |j        |j	                  S )ay  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, ImageGPTForImageClassification
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
        >>> model = ImageGPTForImageClassification.from_pretrained("openai/imagegpt-small")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        ```N)
rW  r   rX  rY  r   rZ  r   r   r[  r\  r   r   r   )r  r  rW  r   ra  )
r`   rc  r-   r   r  loss_functionr   rW  r   ra  )rz   r!  rW  r   rX  rY  r   rZ  r  r   r   r[  r\  r$  r  r   pooled_hidden_statesr  r  r  s                       r'   r   z&ImageGPTForImageClassification.forward  s   d &1%<kk$+B]"..+))%'/!5# / 
 
 ,A.,11a188011%%ffdkBBD 	FY!4QRR!88F)-)9TGf$$vE//?-;*5
 
 
 	
r)   )NNNNNNNNNNNN)r   r   r   r   rw   r   r   rX   ry   r
   r   r   r   r   r   r   r   r   s   @r'   r  r    s       ~        -1+/1515/3,004)-$(,0/3&*T
 T
EL)T
 "%T
 !.	T

 !.T
 u|,T
 EL)T
  -T
 &T
 D>T
 $D>T
 'tnT
 d^T
 T
 
u66	7T
 T
 T
 ^T
 T
 T
 T
 T
r)   r  )r  r  r;  r   rn   )3__doc__r/  rC   typingr   r   r   rX   r   torch.nnr   activationsr	   cache_utilsr
   r   r   
generationr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   configuration_imagegptr   
get_loggerr   rA   rn   Modulerp   r   r  r  r   r;  r  r  __all__r$   r)   r'   <module>r     s   % $  				 ' ' ' ' ' ' ' ' ' '        % % % % % % ! ! ! ! ! ! C C C C C C C C C C ) ) ) ) ) ) 9 9 9 9 9 9         
 . - - - - - Y Y Y Y Y Y Y Y Y Y         
 3 2 2 2 2 2 
	H	%	%i i iX
 
 
 
 
	 
 
 
i) i) i) i) i)	 i) i) i)X    ")   "J* J* J* J* J*. J* J* J*Z #s #s #s #s #so #s #s #sL m
 m
 m
 m
 m
+ m
 m
 m
`   
 
 
 
 
%<o 
 
 
D   _
 _
 _
 _
 _
%< _
 _
 _
D  r)   