
     `iD                        d Z ddlZddlmZ ddlmZmZmZ ddlZddlm	Z	 ddl
mZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlmZmZ ddlmZmZm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z( ddl)m*Z*m+Z+  e%j,        e-          Z.e e#d           G d de"                                  Z/ G d de	j0                  Z1 G d de	j0                  Z2 G d de	j0                  Z3de2iZ4 G d de	j0                  Z5 G d  d!e	j0                  Z6 G d" d#e	j0                  Z7 G d$ d%e          Z8 G d& d'e	j0                  Z9e# G d( d)e                      Z: G d* d+e	j0                  Z; G d, d-e	j0                  Z<	 dKd/e	j0        d0ej=        d1ej=        d2ej=        d3eej=                 d4e>d5e>fd6Z? G d7 d8e	j0                  Z@ G d9 d:e          ZA G d; d<e	j0                  ZB G d= d>e	j0                  ZC e#d?           G d@ dAe:                      ZD G dB dCe	j0                  ZE e#dD           G dE dFe:                      ZF e#dG           G dH dIe:e                      ZGg dJZHdS )LzPyTorch GIT model.    N)	dataclass)CallableOptionalUnion)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)_prepare_4d_attention_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputauto_docstringcan_return_tuplelogging	torch_int)deprecate_kwarg   )	GitConfigGitVisionConfigz}
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    )custom_introc                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eeej        df                  ed<   dZeeej        df                  ed<   dS )GitVisionModelOutputz
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    Nimage_embedslast_hidden_state.hidden_states
attentions)__name__
__module____qualname____doc__r$   r   torchFloatTensor__annotations__r%   r&   tupler'        x/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/git/modeling_git.pyr#   r#   4   s          
 15L(5,-44459x 12999=AM8E%"3S"89:AAA:>Ju0#567>>>>>r1   r#   c                        e Zd ZdZ fdZ	 	 	 	 ddeej                 deej                 deej                 de	d	ej
        f
d
Z xZS )GitEmbeddingsz;Construct the embeddings from word and position embeddings.c                    t                                                       t          j        |j        |j        |j                  | _        t          j        |j        |j                  | _	        t          j
        |j        |j                  | _
        t          j        |j                  | _        t          |dd          | _        |                     dt%          j        |j                                      d          d           d S )	N)padding_idxepsposition_embedding_typeabsoluteposition_idsr   F
persistent)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutgetattrr9   register_bufferr,   arangeexpandselfconfig	__class__s     r2   rA   zGitEmbeddings.__init__J   s    !|F,=v?Q_e_rsss#%<0NPVPb#c#c  f&8f>STTTz&"<=='.v7PR\']']$EL)GHHOOPWXXej 	 	
 	
 	
 	
 	
r1   Nr   	input_idsr;   inputs_embedspast_key_values_lengthreturnc                 ~   ||                                 }n|                                 d d         }|d         }|| j        d d |||z   f         }||                     |          }n|}| j        dk    r|                     |          }||z  }|                     |          }|                     |          }|S )Nr=   r   r:   )sizer;   rF   r9   rH   rI   rM   )	rS   rV   r;   rW   rX   input_shape
seq_length
embeddingsrH   s	            r2   forwardzGitEmbeddings.forwardY   s      #..**KK',,..ss3K ^
,QQQ0FVlIl0l-lmL --i88JJ&J':55"&":":<"H"H--J^^J//
\\*--
r1   )NNNr   )r(   r)   r*   r+   rA   r   r,   
LongTensorr-   intTensorr_   __classcell__rU   s   @r2   r4   r4   G   s        EE
 
 
 
 
" 153759&' E,- u/0   12	
 !$ 
       r1   r4   c                        e Zd Zd fd	Z eddd          	 	 	 	 	 ddej        d	eej                 d
eej                 dee	         dee
         dee
         deej                 fd            Z xZS )GitSelfAttentionNc                 ,   t                                                       |j        |j        z  dk    r0t	          |d          s t          d|j         d|j         d          || _        |(t                              d| j	        j
         d           |j        | _        t          |j        |j        z            | _        | j        | j        z  | _        t          |j        j        |j        j        z  dz  d	z             | _        |j        | xj        |j        z  c_        t'          j        |j        | j                  | _        t'          j        |j        | j                  | _        t'          j        |j        | j                  | _        t'          j        |j                  | _        |pt7          |d
d          | _        | j        dk    s| j        dk    r8|j        | _        t'          j        d|j        z  d	z
  | j                  | _        d S d S )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()zInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.   r   r9   r:   relative_keyrelative_key_query) r@   rA   rD   num_attention_headshasattr
ValueError	layer_idxloggerwarning_oncerU   r(   ra   attention_head_sizeall_head_sizevision_config
image_size
patch_sizeimage_patch_tokensnum_image_with_embeddingr   LinearquerykeyvaluerK   attention_probs_dropout_probrM   rN   r9   rG   rB   distance_embeddingrS   rT   r9   rp   rU   s       r2   rA   zGitSelfAttention.__init__x   s%    ::a??PVXhHiHi?8F$6 8 8 48 8 8   #,!8 , , ,   $*#= #&v'9F<V'V#W#W !58PP"%v';'FI]Ih'hmn&nqr&r"s"s*6##v'FF##Yv143EFF
9V/1CDDYv143EFF
z&"EFF'> (
'-zC
 C
$ '>99T=Y]q=q=q+1+ID(&(l1v7U3UXY3Y[_[s&t&tD### >r=qr1   past_key_valuepast_key_values4.58new_nameversionFr&   attention_mask	head_maskoutput_attentionspixel_values_presentrY   c           	      8   |j         \  }}}	|                     |                              |d| j        | j                                      dd          }
|r| j        nd}|                     |                              |d| j        | j                                      dd          }|                     |                              |d| j        | j                                      dd          }||	                    |d d d d |d d d f         |d d d d |d d d f         | j
                  \  }}t          j        |d d d d d |d d f         |gd          }t          j        |d d d d d |d d f         |gd          }t          j        |
|                    dd                    }| j        dk    s| j        dk    rt|
j         d         |j         d         }}|>t          j        |dz
  t          j        |j        	                              dd          }n:t          j        |t          j        |j        	                              dd          }t          j        |t          j        |j        	                              dd          }||z
  }|                     || j        z   dz
            }|                    |
j        
          }| j        dk    rt          j        d|
|          }||z   }n?| j        dk    r4t          j        d|
|          }t          j        d||          }||z   |z   }|t1          j        | j                  z  }|||z   }t4          j                            |d          }|                     |          }|||z  }t          j        ||          }|                    dddd                                          }|                                 d d         | j!        fz   }|                    |          }||fS )Nr=   r   rj   r   dimrk   rl   dtypedevicer   zbhld,lrd->bhlrzbhrd,lrd->bhlrr   )"shaper{   viewrm   rs   	transposerx   r|   r}   updaterp   r,   catmatmulr9   tensorlongr   rP   r   rG   tor   einsummathsqrtr   
functionalsoftmaxrM   permute
contiguousr[   rt   )rS   r&   r   r   r   r   r   
batch_sizer]   _query_layercutoff	key_layervalue_layerkey_layer_pastvalue_layer_pastattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapes                                r2   r_   zGitSelfAttention.forward   sR    %2$7!
JJJ}%%T*b$":D<TUUYq!__ 	 -AG((aHH]##T*b$":D<TUUYq!__ 	 JJ}%%T*b$":D<TUUYq!__ 	
 &/>/E/E!!!QQQ*+[AAAvww9I-JDN0 0,N, 	9QQQ7F7AAA-=#>"OUVWWWI)[AAAww1A%BDT$U[\]]]K !<Y5H5HR5P5PQQ'>99T=Y]q=q=q'2'8';Y_Q=O*L*!&j1nEJWdWk!l!l!l!q!q" " "'l%*UbUi!j!j!j!o!oprtu!v!v"\*EJ}OcdddiijkmoppN%6H#'#:#:8dFb;bef;f#g#g #7#:#:AR#:#S#S +~==+0<8H+Wk+l+l(#36N#N  -1EEE16>NP[]q1r1r./4|<LiYm/n/n,#36T#TWs#s +di8P.Q.QQ%/.@ -//0@b/II ,,77  -	9O_kBB%--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S%**+BCCo--r1   NNNNNFF)r(   r)   r*   rA   r   r,   rb   r   r-   r
   boolr/   r_   rc   rd   s   @r2   rf   rf   w   s        u  u  u  u  u  uD _%0A6RRR 7;15+/,1/4R. R.|R. !!23R. E-.	R.
 "%R. $D>R. 'tnR. 
u|	R. R. R. SRR. R. R. R. R.r1   rf   c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )GitSelfOutputc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j	                  | _
        d S Nr7   )r@   rA   r   rz   rD   denserI   rJ   rK   rL   rM   rR   s     r2   rA   zGitSelfOutput.__init__   sf    Yv163EFF
f&8f>STTTz&"<==r1   r&   input_tensorrY   c                     |                      |          }|                     |          }|                     ||z             }|S Nr   rM   rI   rS   r&   r   s      r2   r_   zGitSelfOutput.forward   @    

=11]33}|'CDDr1   r(   r)   r*   rA   r,   rb   r_   rc   rd   s   @r2   r   r      i        > > > > >U\  RWR^        r1   r   eagerc                        e Zd Zd fd	Zd Z eddd          	 	 	 	 	 dd	ej        d
eej	                 deej	                 dee
         dee         dee         deej                 fd            Z xZS )GitAttentionNc                     t                                                       t          |j                 |||          | _        t          |          | _        t                      | _        d S )N)r9   rp   )	r@   rA   GIT_SELF_ATTENTION_CLASSES_attn_implementationrS   r   outputsetpruned_headsr   s       r2   rA   zGitAttention.__init__  sb    .v/JK,Cy
 
 
	 $F++EEr1   c                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r   r   )lenr   rS   rm   rs   r   r   r{   r|   r}   r   r   rt   union)rS   headsindexs      r2   prune_headszGitAttention.prune_heads  s    u::??F7490$)2OQUQb
 
u
 -TY_eDD	*49=%@@	,TY_eDD	.t{/@%QOOO )-	(EE

(R	%"&)"?$)B_"_	 -33E::r1   r   r   r   r   Fr&   r   r   r   r   rY   c                 p    |                      ||||||          \  }}|                     ||          }	|	|fS r   )rS   r   )
rS   r&   r   r   r   r   r   attn_outputself_attn_weightsattention_outputs
             r2   r_   zGitAttention.forward   sQ     *. *
 *
&&  ;;{MBB!222r1   r   r   )r(   r)   r*   rA   r   r   r,   rb   r   r-   r
   r   r/   r_   rc   rd   s   @r2   r   r     s        " " " " " "; ; ;$ _%0A6RRR 7;15+/,1/43 3|3 !!233 E-.	3
 "%3 $D>3 'tn3 
u|	3 3 3 SR3 3 3 3 3r1   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )GitIntermediatec                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S r   )r@   rA   r   rz   rD   intermediate_sizer   
isinstance
hidden_actstrr	   intermediate_act_fnrR   s     r2   rA   zGitIntermediate.__init__8  sn    Yv163KLL
f'-- 	9'-f.?'@D$$$'-'8D$$$r1   r&   rY   c                 Z    |                      |          }|                     |          }|S r   )r   r   rS   r&   s     r2   r_   zGitIntermediate.forward@  s,    

=1100??r1   r   rd   s   @r2   r   r   7  s^        9 9 9 9 9U\ el        r1   r   c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )	GitOutputc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j	        |j
                  | _        d S r   )r@   rA   r   rz   r   rD   r   rI   rJ   rK   rL   rM   rR   s     r2   rA   zGitOutput.__init__H  sf    Yv79KLL
f&8f>STTTz&"<==r1   r&   r   rY   c                     |                      |          }|                     |          }|                     ||z             }|S r   r   r   s      r2   r_   zGitOutput.forwardN  r   r1   r   rd   s   @r2   r   r   G  r   r1   r   c                        e Zd Zd fd	Z eddd          	 	 	 	 	 ddej        d	eej                 d
eej                 dee	         dee
         dee
         deej                 fd            Zd Z xZS )GitLayerNc                     t                                                       |j        | _        d| _        t	          ||          | _        t          |          | _        t          |          | _	        d S )Nr   )rp   )
r@   rA   chunk_size_feed_forwardseq_len_dimr   	attentionr   intermediater   r   )rS   rT   rp   rU   s      r2   rA   zGitLayer.__init__V  sc    '-'E$%f	BBB+F33''r1   r   r   r   r   Fr&   r   r   r   r   rY   c                     |                      ||||||          \  }}t          | j        | j        | j        |          }	|	|fS )N)r   r   r   )r   r   feed_forward_chunkr   r   )
rS   r&   r   r   r   r   r   r   self_attention_weightslayer_outputs
             r2   r_   zGitLayer.forward^  sh     48>>/+!5 4B 4
 4
00 1#T%A4CSUe
 
 333r1   c                 \    |                      |          }|                     ||          }|S r   )r   r   )rS   r   intermediate_outputr   s       r2   r   zGitLayer.feed_forward_chunkw  s2    "//0@AA{{#68HIIr1   r   r   )r(   r)   r*   rA   r   r,   rb   r   r-   r
   r   r/   r_   r   rc   rd   s   @r2   r   r   U  s        ( ( ( ( ( ( _%0A6RRR 7;15+/,1/44 4|4 !!234 E-.	4
 "%4 $D>4 'tn4 
u|	4 4 4 SR40      r1   r   c                   6    e Zd Z fdZ	 	 	 	 	 	 	 	 ddej        deej                 deej                 deee	e
e
ej                          f                  d	ee         d
ee         dee         dee         dee         dee
ej                 ef         fdZ xZS )
GitEncoderc                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 0    g | ]}t          |          S r0   )r   ).0irT   s     r2   
<listcomp>z'GitEncoder.__init__.<locals>.<listcomp>  s#    #a#a#aAHVQ$7$7#a#a#ar1   F)	r@   rA   rT   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingrR   s    `r2   rA   zGitEncoder.__init__~  s`    ]#a#a#a#avG_A`A`#a#a#abb
&+###r1   NFTr&   r   r   r   	use_cacher   output_hidden_statesr   return_dictrY   c
           	         | j         r%| j        r|rt                              d           d}|r|t	          | j                  }|rdnd }
|rdnd }t          | j                  D ]?\  }}|r|
|fz   }
|||         nd } |||||||          }|d         }|r||d         fz   }@|r|
|fz   }
|	st          d |||
|fD                       S t          |||
|          S )	NzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)rT   r0   r   r   c              3      K   | ]}||V  	d S r   r0   )r   vs     r2   	<genexpr>z%GitEncoder.forward.<locals>.<genexpr>  s4       	 	 =  !===	 	r1   r%   r   r&   r'   )
r  trainingrq   rr   r   rT   	enumerater   r/   r   )rS   r&   r   r   r   r  r   r  r   r  all_hidden_statesall_self_attentionsr   layer_modulelayer_head_masklayer_outputss                   r2   r_   zGitEncoder.forward  s    & 	"4= 	" "##p   "	 	?0*$+>>>O"6@BBD$5?bb4(44 	P 	POA|# I$58H$H!.7.CillO(L!$ M *!,M  P&9]1=M<O&O# 	E 1]4D D 
	 	 	 "#%'		 	 	 	 	 	 '+++*	
 
 
 	
r1   )NNNNFFFT)r(   r)   r*   rA   r,   rb   r   r-   r   r
   r/   r   r   r_   rc   rd   s   @r2   r   r   }  s!       , , , , , 7;15SW$(,1/4/4&*>
 >
|>
 !!23>
 E-.	>

 "%uU5;L5M/N(N"OP>
 D>>
 $D>>
 'tn>
 'tn>
 d^>
 
uU\"$;;	<>
 >
 >
 >
 >
 >
 >
 >
r1   r   c                   (    e Zd ZU eed<   dZdZd ZdS )GitPreTrainedModelrT   gitTc                    t          |t                    rt          j                            |j        d| j        j                   t          j                            |j        j	        | j        j                   t          j                            |j
        j	        | j        j                   t          |t          j                  rT|j	        j                            d| j        j                   |j         |j        j                                         dS dS t          |t          j                  r_|j	        j                            d| j        j                   |j        +|j	        j        |j                                                  dS dS t          |t          j                  r?|j        j                                         |j	        j                            d           dS dS )zInitialize the weights        )meanstd)r  Ng      ?)r   GitVisionEmbeddingsr   initnormal_class_embeddingrT   initializer_rangepatch_embeddingweightposition_embeddingrz   databiaszero_rB   r6   rI   fill_)rS   modules     r2   _init_weightsz GitPreTrainedModel._init_weights  s   f122 	aGOOF2$+B_O```GOOF29t{?\O]]]GOOF5<$+B_O```fbi(( 	* M&&CT[5R&SSS{& &&((((( '&-- 	*M&&CT[5R&SSS!-"6#56<<>>>>> .--- 	*K""$$$M$$S)))))	* 	*r1   N)r(   r)   r*   r   r.   base_model_prefixsupports_gradient_checkpointingr%  r0   r1   r2   r  r    s=         &*#* * * * *r1   r  c                   v     e Zd Zdef fdZdej        dededej        fdZdd	ej	        dej        fd
Z
 xZS )r  rT   c                 z   t                                                       || _        |j        | _        |j        | _        |j        | _        t          j        t          j
        | j                            | _        t          j        |j        | j        | j        | j        d          | _        | j        | j        z  dz  | _        | j        dz   | _        t          j        | j        | j                  | _        |                     dt          j        | j                                      d          d           d S )NF)in_channelsout_channelskernel_sizestrider!  rj   r   r;   r<   r>   )r@   rA   rT   rD   	embed_dimrv   rw   r   	Parameterr,   randnr  Conv2dnum_channelsr  num_patchesnum_positionsrB   r  rO   rP   rQ   rR   s     r2   rA   zGitVisionEmbeddings.__init__  s   + + +!|EK,G,GHH!y+? 
  
  
 !Ot>1D!-1"$,t/A4>"R"R^U\$:L-M-M-T-TU\-]-]jopppppr1   r^   heightwidthrY   c                    |j         d         dz
  }| j        j                            d          }|j         d         dz
  }t          j                                        s&||k    r ||k    r|                     | j                  S |ddddf         }|ddddf         }|j         d         }	|| j        z  }
|| j        z  }t          |dz            }|
                    d|||	          }|                    dddd          }t          j                            ||
|fdd	
          }|                    dddd                              dd|	          }t	          j        ||fd          S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nr=   g      ?r   rj   bicubicF)r[   modealign_cornersr   )r   r  r  	unsqueezer,   jit
is_tracingr;   rw   r   reshaper   r   r   interpolater   r   )rS   r^   r5  r6  r3  r  r4  class_pos_embedpatch_pos_embedr   
new_height	new_widthsqrt_num_positionss                r2   interpolate_pos_encodingz,GitVisionEmbeddings.interpolate_pos_encoding  s    !&q)A-!4;EEaHH*03a7 y##%% 	>+*F*F6UZ??**4+<===,QQQU3,QQQU3r"t.
T_,	&}c'9::)11!5GI[]`aa)11!Q1==-33i(	 4 
 
 *11!Q1==BB1b#NNy/?;CCCCr1   Fpixel_valuesc                 <   |j         \  }}}}|s<|| j        k    s|| j        k    r&t          d| d| d| j         d| j         d	          | j        j        j        }|                     |                    |                    }|                    d                              dd          }| j	        
                    |dd          }	t          j        |	|gd	          }
|r|
|                     |
||          z   }
n|
|                     | j                  z   }
|
S )
NzInput image size (*z) doesn't match model ().r   rj   r   r=   r   )r   rv   ro   r  r  r   r   flattenr   r  rQ   r,   r   rE  r  r;   )rS   rF  rE  r   r   r5  r6  target_dtypepatch_embedsclass_embedsr^   s              r2   r_   zGitVisionEmbeddings.forward!  sD   '3'9$
Avu' 	Vt-F-F%SWSbJbJbqVqqeqqDOqq^b^mqqq   +28++LOO,O,O,OPP#++A..88A>>+22:q"EEYl;CCC
# 	Q#d&C&CJPVX]&^&^^JJ#d&=&=d>O&P&PPJr1   F)r(   r)   r*   r    rA   r,   rb   ra   rE  r-   r_   rc   rd   s   @r2   r  r    s        q q q q q q q,'D5< 'D 'DUX 'D]b]i 'D 'D 'D 'DR E$5 Z_Zf        r1   r  c                   B     e Zd Z fdZdej        dej        fdZ xZS )GitVisionMLPc                    t                                                       || _        t          |j                 | _        t          j        |j        |j	                  | _
        t          j        |j	        |j                  | _        d S r   )r@   rA   rT   r	   r   activation_fnr   rz   rD   r   fc1fc2rR   s     r2   rA   zGitVisionMLP.__init__5  sf    #F$569V/1IJJ9V5v7IJJr1   r&   rY   c                     |                      |          }|                     |          }|                     |          }|S r   )rS  rR  rT  r   s     r2   r_   zGitVisionMLP.forward<  s=    //**=99//r1   r   rd   s   @r2   rP  rP  4  sc        K K K K KU\ el        r1   rP  r  r$  r{   r|   r}   r   scalingrM   c                    t          j        ||                    dd                    |z  }|||z   }t          j                            |dt           j                                      |j                  }t          j        	                    ||| j
                  }t          j        ||          }	|	                    dd                                          }	|	|fS )Nr=   r   )r   r   )pr
  r   rj   )r,   r   r   r   r   r   float32r   r   rM   r
  r   )
r$  r{   r|   r}   r   rV  rM   kwargsattn_weightsr   s
             r2   eager_attention_forwardr\  D  s     <s}}R'<'<==GL!#n4=((2U](SSVVW\WbccL=((6?([[L,|U33K''1--88::K$$r1   c                        e Zd ZdZ fdZ	 	 	 ddej        deej                 deej                 dee         d	e	ej        eej                 f         f
d
Z
 xZS )GitVisionAttentionz=Multi-headed attention from 'Attention Is All You Need' paperc                    t                                                       || _        |j        | _        |j        | _        | j        | j        z  | _        | j        | j        z  | j        k    r t          d| j         d| j         d          | j        dz  | _	        |j
        | _        d| _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: rI  g      F)r@   rA   rT   rD   r.  rm   	num_headshead_dimro   scaleattention_dropoutrM   	is_causalr   rz   k_projv_projq_projout_projrR   s     r2   rA   zGitVisionAttention.__init__^  s   +3$.8=4>)T^;;'dn ' 'N' ' '   ]D(
/i??i??i??	$.$.AAr1   NFr&   r   causal_attention_maskr   rY   c           
      n   |j         \  }}}|                     |          }|                     |          }	|                     |          }
|                    ||| j        | j                                      dd          }|	                    ||| j        | j                                      dd          }	|
                    ||| j        | j                                      dd          }
| j        j	        dk    r||||z   }n||}n	|du| _
        t          }| j        j	        dk    rt          | j        j	                 } || ||	|
|| j
        | j        | j        sdn| j                  \  }}|                    |||                                          }|                     |          }|sd}||fS )z#Input shape: Batch x Time x Channelr   rj   flash_attention_2Nr   r  )rd  rV  rM   )r   rg  re  rf  r   r`  ra  r   rT   r   rd  r\  r   rb  r
  rM   r>  r   rh  )rS   r&   r   ri  r   r   r]   r.  querieskeysvaluesattention_interfacer   r[  s                 r2   r_   zGitVisionAttention.forwardr  s    -:,?)
J	++m,,{{=))]++,,z:t~t}UU__`acdeeyyZOOYYZ[]^__ZT^T]SS]]^_abcc ;+/BBB).C.O!/2G!G&2!62$>DN(?;+w66"9$+:Z"[$7$7nJ#}>CC$,	%
 	%
 	%
!\ "))*j)LLWWYYmmK00  	 LL((r1   )NNF)r(   r)   r*   r+   rA   r,   rb   r   r   r/   r_   rc   rd   s   @r2   r^  r^  [  s        GGB B B B B. 268<,1/) /)|/) !./)  (5	/)
 $D>/) 
u|Xel33	4/) /) /) /) /) /) /) /)r1   r^  c                        e Zd Zdef fdZ	 d
dej        dej        dej        dee         de	ej
                 f
d	Z xZS )GitVisionEncoderLayerrT   c                 D   t                                                       |j        | _        t	          |          | _        t          j        | j        |j                  | _	        t          |          | _        t          j        | j        |j                  | _        d S r   )r@   rA   rD   r.  r^  	self_attnr   rI   rJ   layer_norm1rP  mlplayer_norm2rR   s     r2   rA   zGitVisionEncoderLayer.__init__  s    ++F33<F<QRRR''<F<QRRRr1   Fr&   r   ri  r   rY   c                     |}|                      |          }|                     ||||          \  }}||z   }|}|                     |          }|                     |          }||z   }|f}|r||fz  }|S )aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r&   r   ri  r   )rt  rs  rv  ru  )rS   r&   r   ri  r   residualr[  outputss           r2   r_   zGitVisionEncoderLayer.forward  s    " !((77&*nn')"7/	 '5 '
 '
#| !=0 ((77// =0 " 	'&Gr1   rN  )r(   r)   r*   r    rA   r,   rb   r   r   r/   r-   r_   rc   rd   s   @r2   rq  rq    s        S S S S S S S -2& &|& &  %|	&
 $D>& 
u 	!& & & & & & & &r1   rq  c                        e Zd ZdZdef fdZe	 	 	 	 	 ddeej	                 deej	                 dee
         dee
         d	ee
         d
eeef         fd            Z xZS )GitVisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`GitVisionEncoderLayer`].

    Args:
        config: GitVisionConfig
    rT   c                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S r0   )rq  r   r   rT   s     r2   r   z-GitVisionEncoder.__init__.<locals>.<listcomp>  s"    $l$l$lq%:6%B%B$l$l$lr1   F)	r@   rA   rT   r   r   r   r   layersr  rR   s    `r2   rA   zGitVisionEncoder.__init__  sa    m$l$l$l$lERXRjLkLk$l$l$lmm&+###r1   Nr   ri  r   r  r  rY   c                 @   ||n| j         j        }||n| j         j        }||n| j         j        }|rdnd}|rdnd}|}	t	          | j                  D ]2\  }
}|r||	fz   } ||	|||          }|d         }	|r||d         fz   }3|r||	fz   }t          |	||          S )a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr0   )r   r   r   r%   r&   r'   )rT   r   r  use_return_dictr  r  r   )rS   rW   r   ri  r   r  r  encoder_statesall_attentionsr&   idxencoder_layerr  s                r2   r_   zGitVisionEncoder.forward  s    N 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]3=0:d%"+DK"8"8 	F 	FC# C!/=2B!B)M%"3	  M *!,M  F!/=3C2E!E 	?+}.>>N+>Vd
 
 
 	
r1   )NNNNN)r(   r)   r*   r+   r    rA   r   r   r,   rb   r   r   r/   r   r_   rc   rd   s   @r2   r{  r{    s         , , , , , , ,  268<,0/3&*D
 D
 !.D
  (5	D

 $D>D
 'tnD
 d^D
 
uo%	&D
 D
 D
 D
 D
 D
 D
 D
r1   r{  c                        e Zd Zdef fdZe	 	 	 	 	 ddeej                 dee	         dee	         dee	         d	ee	         d
e
eef         fd            Z xZS )GitVisionTransformerrT   c                 4   t                                                       || _        |j        }t	          |          | _        t          j        ||j                  | _	        t          |          | _        t          j        ||j                  | _        d S r   )r@   rA   rT   rD   r  r^   r   rI   rJ   pre_layrnormr{  encoderpost_layernorm)rS   rT   r.  rU   s      r2   rA   zGitVisionTransformer.__init__1  s    &	-f55L8MNNN'// l9&:OPPPr1   NFrF  r   r  rE  r  rY   c                    ||n| j         j        }||n| j         j        }||n| j         j        }|t	          d          |                     ||          }|                     |          }|                     ||||          }|d         }|                     |          }|s|f|dd          z   S t          ||j
        |j                  S )Nz You have to specify pixel_valuesrE  )rW   r   r  r  r   r   r  )rT   r   r  r  ro   r^   r  r  r  r   r&   r'   )	rS   rF  r   r  rE  r  r&   encoder_outputsr%   s	            r2   r_   zGitVisionTransformer.forward;  s    2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]?@@@Oghh))-88,,'/!5#	 ' 
 
 ,A. //0ABB 	>%'/!""*===/)7&1
 
 
 	
r1   NNNFN)r(   r)   r*   r    rA   r   r   r,   r-   r   r   r/   r   r_   rc   rd   s   @r2   r  r  /  s        Q Q Q Q Q Q Q  59,0/338&*&
 &
u01&
 $D>&
 'tn	&

 #+4.&
 d^&
 
uo%	&&
 &
 &
 ^&
 &
 &
 &
 &
r1   r  zY
    The vision model from CLIP, used in GIT, without any head or projection on top.
    c                        e Zd ZU eed<   dZdef fdZdej        fdZ	e
	 	 	 	 	 ddeej                 dee         d	ee         d
edee         deeef         fd            Z xZS )GitVisionModelrT   rF  c                     t                                          |           t          |          | _        |                                  d S r   )r@   rA   r  vision_model	post_initrR   s     r2   rA   zGitVisionModel.__init__o  sA       088r1   rY   c                 $    | j         j        j        S r   )r  r^   r  rS   s    r2   get_input_embeddingsz#GitVisionModel.get_input_embeddingsu  s     +;;r1   NFr   r  rE  r  c                 V    ||n| j         j        }|                     |||||          S )a{  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, GitVisionModel

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base")
        >>> model = GitVisionModel.from_pretrained("microsoft/git-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        ```N)rF  r   r  rE  r  )rT   r  r  )rS   rF  r   r  rE  r  s         r2   r_   zGitVisionModel.forwardx  sC    8 &1%<kk$+B]  %/!5%=# ! 
 
 	
r1   r  )r(   r)   r*   r    r.   main_input_namerA   r   Moduler  r   r   r,   r-   r   r   r/   r   r_   rc   rd   s   @r2   r  r  e  s         $O      <bi < < < <  59,0/3).&*#
 #
u01#
 $D>#
 'tn	#

 #'#
 d^#
 
uo%	&#
 #
 #
 ^#
 #
 #
 #
 #
r1   r  c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )GitProjectionrT   c                    t                                                       || _        t          j        t          j        |j        j        |j                  t          j        |j        |j        j	                            | _
        d S r   )r@   rA   rT   r   
Sequentialrz   ru   rD   rI   rJ   visual_projectionrR   s     r2   rA   zGitProjection.__init__  sm    !#If*68JKKL+1E1TUUU"
 "
r1   r^   rY   c                 ,    |                      |          S r   )r  )rS   r^   s     r2   r_   zGitProjection.forward  s    %%j111r1   )	r(   r)   r*   r   rA   r,   rb   r_   rc   rd   s   @r2   r  r    sj        
y 
 
 
 
 
 
2%, 25< 2 2 2 2 2 2 2 2r1   r  zy
    The bare GIT Model transformer consisting of a CLIP image encoder and text decoder outputting raw hidden-states
    c                       e Zd Z fdZd Zd Zd Zdedej	        dej
        dej        fd	ZddZe	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 deeeeej                 f                  dee         dee         dee         dedee         deeej                 ef         fd            Z xZS )GitModelc                    t                                                     | _        t                    | _        t          j                  | _        t                    | _	        t                    | _        j        7t          j        fdt          j                  D                       | _        |                                  d S )Nc              3   |   K   | ]6}t          j        t          j        d d j        j                            V  7dS )r   N)r   r/  r,   zerosru   rD   r~  s     r2   r  z$GitModel.__init__.<locals>.<genexpr>  sU       ; ; U[Av/C/OPPQQ; ; ; ; ; ;r1   )r@   rA   rT   r4   r^   r  ru   image_encoderr   r  r  r  ry   r   ParameterListr   img_temporal_embeddingr  rR   s    `r2   rA   zGitModel.__init__  s       '//+F,@AA!&))!.v!6!6*6*,*: ; ; ; ;v>??; ; ; + +D' 	r1   c                     | j         j        S r   r^   rF   r  s    r2   r  zGitModel.get_input_embeddings  s    ..r1   c                     || j         _        d S r   r  )rS   r}   s     r2   set_input_embeddingszGitModel.set_input_embeddings  s    */'''r1   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  r   r   r   )rS   heads_to_pruner   r   s       r2   _prune_headszGitModel._prune_heads  sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr1   r[   r   r   rY   c                     t          j        t          j        ||||          d          }|                    |dk    t	          d                    }|S )Nr   r   r   )diagonal-inf)r,   triuonesmasked_fillfloat)rS   r[   r   r   masks        r2   _generate_future_maskzGitModel._generate_future_mask  sN    z%*T4eLLLWXYYY	5==99r1   Nc                 p   |j         d         }|j         d         }|j        }|j        }	t          j        ||f||	          }
t          j        |||z   ft          d          |j        |	          }t          j        ||f|	|j                  }|dk    r7t          j        |j         d         |j         d         |z   f|	|j                  }t          j        |
|fd          }t          j        ||                    |	          fd          }t          j        ||fd          d d d f         }|/t          j        |j         d         |j         d         fd|          }|j        t          j	        k    rt          d	          t          j        ||j        
          }t          d          ||<   |                    |j         d         ||z   ||z   |z   f          }|                                }|d d d d d |f         }|d d d d d f         }||z   |d d d d d |f<   |d d d d d d d f         }|S )Nr   r  r  r   r   r   F)
fill_valuer   z1Memory key padding mask must be a boolean tensor.r   )r   r   r   r,   r  fullr  r   r   r   ro   
zeros_likerQ   clone)rS   tgtmemorytgt_maskrX   memory_key_padding_masknum_tgt
num_memoryr   r   top_left	top_rightbottom_leftleftrightfull_attention_maskzero_negative_infinityorigin_leftr   s                      r2   create_attention_maskzGitModel.create_attention_mask  s   )A,\!_
	;
J7eTTTJ#99:&MM:	
 
 
	 kj!?
 
 
 "A%%{"HN1$58N$NO  H y(K0a888	9hkk%&8&89qAAA#iu1===dAAAgF"*&+j&,q/6<PQ?1S`ent&u&u&u#"(EJ66PQQQ!&!12IQTQZ![![![:?--67188$*1-zG/CZRhEhkrErs
 
 27799)!!!QQQ*;<'4
31<v1EAAAqqq+:+-. 2!!!T111aaa-@""r1   FrV   r   r;   rF  r   rW   r   r  r   r  rE  r  c                 p   |	|	n| j         j        }	|
|
n| j         j        }
||n| j         j        }||n| j         j        }||t          d          |+|                     ||           |                                }n.||                                dd         }nt          d          |d         }d}|=t          |t                    s|
                                n|
                                }|                     || j         j                  }d}||j        dk    r|                     ||          j        }n|j        d	k    rg }t!          |j        d                   D ]S}|                     |dd|ddddf         |          j        }|| j        |         z  }|                    |           Tt)          j        |d
          }nt          d          |                     |          }|                     ||||          }|:t)          j        |j        d         d|j        d         f|j        |j                  }|                    |                    d          |                    d          z  dd          }t)          j        ||fd
          }|                     ||j        |j                  }|                     ||||          }|{t=          ||j        |d                                       |j                  }|dk    r|dddd| dddf         }n*|dddd|d          d|d          dfxx         |z  cc<   |                      ||||||	|
||du	  	        }|d         }|s|f|dd         z   S tC          ||j"        |j#        |j$                  S )a  
        Examples:

        ```python
        >>> from transformers import AutoProcessor, AutoModel
        >>> import requests
        >>> from PIL import Image

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base")
        >>> model = AutoModel.from_pretrained("microsoft/git-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> text = "this is an image of two cats"

        >>> inputs = processor(images=image, text=text, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        ```NzDYou cannot specify both input_ids and inputs_embeds at the same timer=   z5You have to specify either input_ids or inputs_embedsr   r      r     r   z#pixel_values must be of rank 4 or 5)rV   r;   rW   rX   rj   r   )r  r  r  rX   )tgt_len)r   r   r   r  r   r  r  r   r	  )%rT   r   r  r  r  ro   %warn_if_padding_and_no_attention_maskr[   r   r
   get_seq_lengthget_head_maskr   ndimr  r%   r   r   r  appendr,   r   r  r^   r  r   r   repeatr  r  r   r   r  r   r   r&   r'   )rS   rV   r   r;   rF  r   rW   r   r  r   r  rE  r  r\   r]   rX   projected_visual_featuresvisual_features	frame_idxvisual_features_frameembedding_outputr&   r  combined_attention_maskexpanded_attn_maskr  sequence_outputs                              r2   r_   zGitModel.forward  s   J 2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	%0%<kk$+B] ]%>cddd"66y.QQQ#..**KK&',,..ss3KKTUUU ^
 "#& "/5996..000$3355 # &&y$+2OPP	$(!# A%%"&"4"4 ;S #5 # ##   "a''"$!&|'9!'<!=!= B BI,0,>,>$QQQ	111aaa%78Sk -? - -' * *T-H-SS)#**+@AAAA #()O"C"C"C !!FGGG(,(>(>(O(O%??%'#9	 + 
 
 %,(-!'*A/?/Ea/HI&,'.) ) )% %>$D$D!!!$$(A(F(Fq(I(II1a%
 %
!
 	#<>N"OUVWWW --j:J:PRbRijj #'"<"< ,#9	 #= #
 #
 % "< 0 6B" " "b!())  &))%7111?U>U>V>VXYXYXY8Y%Z""'111{1~o.?.?+a.ARAR(RSSSWiiSSS,,2+/!5#!-T!9 ' 

 

 *!, 	<#%(;;;&-+;)7&1	
 
 
 	
r1   r   )NNNNNNNNNNFN)r(   r)   r*   rA   r  r  r  ra   r,   r   r   rb   r  r  r   r   r   r
   listr-   r   r/   r   r_   rc   rd   s   @r2   r  r    s           &/ / /0 0 0C C C# ek 5< \a\h    0# 0# 0# 0#d  -115/3/3,004KO$(,0/3).&*c
 c
EL)c
 !.c
 u|,	c

 u|,c
 EL)c
  -c
 "%tE4E/F(F"GHc
 D>c
 $D>c
 'tnc
 #'c
 d^c
 
uU\"$>>	?c
 c
 c
 ^c
 c
 c
 c
 c
r1   r  z`
    GIT Model with a `language modeling` head on top for autoregressive language modeling.
    c                        e Zd ZdgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 ddee	j
                 dee	j
                 d	ee	j
                 d
ee	j
                 dee	j
                 dee	j
                 dee	j
                 deeeee	j
                 f                  dee         dee         dee         dedee         deee	j
                 ef         fd            Z	 ddZ xZS )GitForCausalLMzoutput.weightc                     t                                          |           t          |          | _        t	          j        |j        |j                  | _        | 	                                 d S r   )
r@   rA   r  r  r   rz   rD   rC   r   r  rR   s     r2   rA   zGitForCausalLM.__init__  s[       F##i 2F4EFF 	r1   c                     | j         S r   r   r  s    r2   get_output_embeddingsz$GitForCausalLM.get_output_embeddings  s
    {r1   c                     || _         d S r   r  )rS   new_embeddingss     r2   set_output_embeddingsz$GitForCausalLM.set_output_embeddings  s    $r1   NFrV   r   r;   rF  r   rW   labelsr   r  r   r  rE  r  rY   c                    ||n| j         j        }|d}	|                     ||||||||	|
|||          }|d         }|                     |          }d}|| j        j        j        d         j        j        j        }|dd|dddf         	                                }|ddddf         	                                } | j
        |                    d| j         j                  |                    d          fd| j         j        i|}|s|f|dd         z   }||f|z   n|S t          |||j        |j        |j                  S )	a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`

        Examples:

        Image captioning example:

        ```python
        >>> from transformers import AutoProcessor, AutoModelForCausalLM
        >>> import requests
        >>> from PIL import Image

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-coco")
        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-coco")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> pixel_values = processor(images=image, return_tensors="pt").pixel_values

        >>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
        >>> generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> print(generated_caption)
        two cats sleeping on a pink blanket next to remotes.
        ```

        Visual question answering (VQA) example:

        ```python
        >>> from transformers import AutoProcessor, AutoModelForCausalLM
        >>> from huggingface_hub import hf_hub_download
        >>> from PIL import Image

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-textvqa")
        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-textvqa")

        >>> file_path = hf_hub_download(repo_id="nielsr/textvqa-sample", filename="bus.png", repo_type="dataset")
        >>> image = Image.open(file_path).convert("RGB")

        >>> pixel_values = processor(images=image, return_tensors="pt").pixel_values

        >>> question = "what does the front of the bus say at the top?"

        >>> input_ids = processor(text=question, add_special_tokens=False).input_ids
        >>> input_ids = [processor.tokenizer.cls_token_id] + input_ids
        >>> input_ids = torch.tensor(input_ids).unsqueeze(0)

        >>> generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)
        >>> print(processor.batch_decode(generated_ids, skip_special_tokens=True))
        ['what does the front of the bus say at the top? special']
        ```

        Video captioning example:

        ```python
        >>> import av
        >>> import numpy as np
        >>> from PIL import Image
        >>> from huggingface_hub import hf_hub_download
        >>> from transformers import AutoProcessor, AutoModelForCausalLM

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-vatex")
        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-vatex")

        >>> # set seed for reproducibility
        >>> np.random.seed(45)


        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`list[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
        ...     '''
        ...     Sample a given number of frame indices from the video.
        ...     Args:
        ...         clip_len (`int`): Total number of frames to sample.
        ...         frame_sample_rate (`int`): Sample every n-th frame.
        ...         seg_len (`int`): Maximum allowed index of sample's last frame.
        ...     Returns:
        ...         indices (`list[int]`): List of sampled frame indices
        ...     '''
        ...     converted_len = int(clip_len * frame_sample_rate)
        ...     end_idx = np.random.randint(converted_len, seg_len)
        ...     start_idx = end_idx - converted_len
        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
        ...     return indices


        >>> # load video
        >>> file_path = hf_hub_download(
        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample frames
        >>> num_frames = model.config.num_image_with_embedding
        >>> indices = sample_frame_indices(
        ...     clip_len=num_frames, frame_sample_rate=4, seg_len=container.streams.video[0].frames
        ... )
        >>> frames = read_video_pyav(container, indices)

        >>> pixel_values = processor(images=list(frames), return_tensors="pt").pixel_values

        >>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50)

        >>> print("Generated caption:", processor.batch_decode(generated_ids, skip_special_tokens=True))
        Generated caption: ['a woman is sitting at a table and she is talking about the food she is holding.']
        ```
        NF)r   r;   rF  r   rW   r   r  r   r  rE  r  r   r=   r   rC   )losslogitsr   r&   r'   )rT   r  r  r   r  r   r   rS   rx   r   loss_functionr   rC   r   r   r&   r'   )rS   rV   r   r;   rF  r   rW   r  r   r  r   r  rE  r  rZ  ry  r  r  r  num_image_tokensshifted_logitsr   s                         r2   r_   zGitForCausalLM.forward  s   j &1%<kk$+B]I(()%%'+/!5%=#  
 
 "!*_--#x/5a8BGZ#AAA'7':AAA$=>IIKKNAAAqrrE]--//F%4%##B(>??B   ;1 	 D  	FY,F)-)9TGf$$vE%#3!/)
 
 
 	
r1   c                 N   |F|                                 }|j        d         |k    r|}n|j        d         dz
  }|d d |d f         }|j        }||                    |          }|||                    d          ||d}	|                                D ]\  }
}|
|	vr||	|
<   |	S )Nr   rF  )rV   r   rF  r   r  )r  r   new_onesgetr  )rS   rV   r   r   r  rZ  past_lengthremove_prefix_lengthr\   model_inputsr|   r}   s               r2   prepare_inputs_for_generationz,GitForCausalLM.prepare_inputs_for_generation  s     &)88::K q!K//'2$$ (1q'9A'=$!!!!%9%:%:":;I  o!&//<<N #,"JJ~66."
 
 !,,.. 	* 	*JC,&&$)S!r1   )NNNNNNNNNNNFN)NNN)r(   r)   r*   _tied_weights_keysrA   r  r  r   r   r,   rb   r   r
   r  r   r/   r   r_   r  rc   rd   s   @r2   r  r    s        **      % % %  -115/3/3,004)-FJ$(,0/3).&*A
 A
EL)A
 !.A
 u|,	A

 u|,A
 EL)A
  -A
 &A
 "%tEL/A(A"BCA
 D>A
 $D>A
 'tnA
 #'A
 d^A
  
uU\"$::	;!A
 A
 A
 ^A
H OS$ $ $ $ $ $ $ $r1   r  )r  r  r  r  )r  )Ir+   r   dataclassesr   typingr   r   r   r,   r   activationsr	   cache_utilsr
   r   
generationr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   pytorch_utilsr   r   r   utilsr   r   r   r   r   utils.deprecationr   configuration_gitr   r    
get_loggerr(   rq   r#   r  r4   rf   r   r   r   r   r   r   r   r  r  rP  rb   r  r\  r^  rq  r{  r  r  r  r  r  __all__r0   r1   r2   <module>r
     s       ! ! ! ! ! ! , , , , , , , , , ,        ! ! ! ! ! ! . . . . . . . . ) ) ) ) ) ) B B B B B B 9 9 9 9 9 9            G F F F F F F F l l l l l l l l l l              1 0 0 0 0 0 9 9 9 9 9 9 9 9 
	H	%	%   	? 	? 	? 	? 	?; 	? 	?  	?- - - - -BI - - -`v. v. v. v. v.ry v. v. v.t    BI     
/3 /3 /3 /3 /329 /3 /3 /3f    bi        	   % % % % %) % % %PE
 E
 E
 E
 E
 E
 E
 E
P * * * * * * * *6P P P P P") P P Pf    29   . % %I%<% 
% <	%
 U\*% % % % % %.F) F) F) F) F) F) F) F)T/ / / / /6 / / /fT
 T
 T
 T
 T
ry T
 T
 T
n3
 3
 3
 3
 3
29 3
 3
 3
l   
2
 2
 2
 2
 2
' 2
 2
 
2
j
2 
2 
2 
2 
2BI 
2 
2 
2   
~
 ~
 ~
 ~
 ~
! ~
 ~
 
~
B   
z z z z z' z z 
zz Q
P
Pr1   