
     `i                        d Z ddlZddlZddlmZ ddlmZmZ ddl	Z	ddl	m
Z
 ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZmZ ddlmZ ddlmZmZmZ ddlmZm Z  ddl!m"Z"  e j#        e$          Z%e ed           G d de                                  Z& G d de
j'                  Z( G d de
j'                  Z) G d de
j'                  Z* G d de
j'                  Z+ G d de
j'                  Z, G d de
j'                  Z- G d  d!e
j'                  Z. G d" d#e
j'                  Z/ G d$ d%e          Z0 G d& d'e
j'                  Z1e G d( d)e                      Z2e G d* d+e2                      Z3 G d, d-e
j'                  Z4 ed.           G d/ d0e2                      Z5 G d1 d2e
j'                  Z6 G d3 d4e
j'                  Z7 ed5           G d6 d7e2                      Z8 ed8           G d9 d:e2                      Z9 ed;           G d< d=e2                      Z:e G d> d?e2                      Z;g d@Z<dS )AzPyTorch ViLT model.    N)	dataclass)OptionalUnion)nn)CrossEntropyLoss   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingMaskedLMOutputModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel) find_pruneable_heads_and_indicesmeshgridprune_linear_layer)auto_docstringlogging   )
ViltConfigzF
    Class for outputs of [`ViltForImagesAndTextClassification`].
    )custom_introc                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eeeej                                   ed<   dZeeeej                                   ed<   dS )(ViltForImagesAndTextClassificationOutputa7  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification (or regression if config.num_labels==1) loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification (or regression if config.num_labels==1) scores (before SoftMax).
    hidden_states (`list[tuple(torch.FloatTensor)]`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        List of tuples of `torch.FloatTensor` (one for each image-text pair, each tuple containing the output of
        the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    Nlosslogitshidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   listtupler        z/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/vilt/modeling_vilt.pyr   r   -   s         	 	 )-D(5$
%,,,*.FHU&'...>BM8Du'8!9:;BBB;?JeE$5678?????r*   r   c                   4     e Zd ZdZ fdZddZ	 ddZ xZS )	ViltEmbeddingsz
    Construct the text and patch embeddings.

    Text embeddings are equivalent to BERT embeddings.

    Patch embeddings are equivalent to ViT embeddings.
    c                    t                                                       t          |          | _        t	          j        t          j        dd|j                            | _	        t          |          | _        | j        j        }t	          j        t          j        d|dz   |j                            | _        t	          j        |j        |j                  | _        t	          j        |j                  | _        || _        d S Nr   )super__init__TextEmbeddingstext_embeddingsr   	Parameterr$   zeroshidden_size	cls_tokenViltPatchEmbeddingspatch_embeddingsnum_patchesposition_embeddings	Embeddingmodality_type_vocab_sizetoken_type_embeddingsDropouthidden_dropout_probdropoutconfig)selfrB   r:   	__class__s      r+   r1   zViltEmbeddings.__init__N   s      .f55ek!Q8J&K&KLL 3F ; ;+7#%<A{QPVPb0c0c#d#d %'\&2QSYSe%f%f"z&"<==r*      c           	         !"# | j         j        j        j        \  }}}}|                      |          }|d d d d d d d f                                         }t
          j                            ||j        d         |j        d         f                                          }|d d df         	                    d          d d df         }	|d d df         	                    d          d d df         }
|j        \  }}#| j
        j        | j
        j        z  }| j        d d dd d d f                             dd                              d|||          !t!          j        !#fdt%          |	|
          D             d          }|                    d                              dd          }|                    d                              dd          }t!          j        t+          t!          j        |j        d                   t!          j        |j        d	                   d
          d	                              |j                  }|d d d d d d d d f         }|                    |j        d         |j        d         d	d	d	          }|                    dd          }|                    d          }dk     st5          t6                    s|	|
z  }|                                n'|	|
z  }t;          |                                          |                    d          "d|z
                      d           "d d df                                         }"fd|D             } fd|D             }d |D             }d |D             }fd|D             }g }tA          t%          |||                    D ]\  }\  }}}|dk    r[t!          j!        t!          j"        |                                                    }|#                    ||         |                    jt!          j!        t!          j"        |                                          |d          }|#                    t!          j        ||         ||         |         gd                     t!          j        |d          }||d d df         |d d df         f                             |d	|          }||d d df         |d d df         f                             |d	          }||d d df         |d d df         f                             |d	d          }||d d df         |d d df         f                             |d	|          }| j$                            |d	d	          }t!          j        ||fd          }t!          j        | j        d d dd d f         d d d d d f                             |d	d	          |fd          }||z   }| %                    |          }t!          j        t!          j"        |j        d         d                              |          |gd          }|||#fffS )N   r   sizer   r   dimc           
          g | ]R\  }}t           j                            t           j                            ||fd d          d|z
  d|z
  f          SS )bilinearT)rI   modealign_cornersr   )r   
functionalpadinterpolate).0hwheightspatial_poswidths      r+   
<listcomp>z/ViltEmbeddings.visual_embed.<locals>.<listcomp>j   s        Aq !!M--#V'&*	 .   	1fqj1   r*   ij)indexingdeviceF)as_tuplec                 <    g | ]}d d df         |k             S Nr   r)   )rS   u	valid_idxs     r+   rY   z/ViltEmbeddings.visual_embed.<locals>.<listcomp>   s/    NNNQ9QQQT?a#78NNNr*   c                 <    g | ]}d d df         |k             S rb   r)   )rS   rc   non_valid_idxs     r+   rY   z/ViltEmbeddings.visual_embed.<locals>.<listcomp>   s0    ZZZ]=A+>!+CDZZZr*   c                 8    g | ]}|                     d           S r   rH   rS   vs     r+   rY   z/ViltEmbeddings.visual_embed.<locals>.<listcomp>   s"    777AaffQii777r*   c                 8    g | ]}|                     d           S rh   rH   ri   s     r+   rY   z/ViltEmbeddings.visual_embed.<locals>.<listcomp>   s"    ???!&&))???r*   c                     g | ]}|z
  S r)   r)   )rS   rj   max_image_lengths     r+   rY   z/ViltEmbeddings.visual_embed.<locals>.<listcomp>   s    ===Q$q(===r*   T)replacement)&r9   
projectionweightshapefloatr   rP   rR   longsumrB   
image_size
patch_sizer;   	transposeviewr$   catzipflattenstackr   arangetor_   expand
isinstanceintmaxminnonzerounique	enumeratemultinomialonesappendr7   rA   )$rC   pixel_values
pixel_maskrm   _phpwxx_maskx_hx_w
batch_sizenum_channels	patch_dim	pos_embedpatch_indexeffective_resolutionunique_rowsvalid_row_idxnon_valid_row_idx
valid_numsnon_valid_numspad_numsselectirj   nvpvalid_choice
pad_choice
cls_tokensrV   rf   rW   rd   rX   s$      `                           @@@@@r+   visual_embedzViltEmbeddings.visual_embed]   s   ,7>D1b"!!,//AAAtQQQM*0022**6QWQZ8P*QQVVXXQQQTl1%%aaad+QQQTl1%%aaad+23'/
L&%K*dk.DD	.qqq!""aaax8BB1aHHMMaQ]_hjsttI       SMM   
 
 
	  %%a((221a88	IIaLL""1a((kU\&,r"233U\&,rBR5S5S^bcccik
 
 

"FM"
"
" 	 "$aaaAAA"56!((a&,q/2rSUVV!))!Q//""a#3#;:N^`cCdCd#;
 $'9 37799#&9 "#7#;#;#=#=?OPPNNEN22	V,,e,<<1o,,..NNNN+NNNZZZZkZZZ77777
??->???====*===&s:~x'P'PQQ 	f 	fMAz2qAvv$0A1D1D1F1FHXYYmA.|<====".uz"~~/C/C/E/EqVZ[[[
eiq)9;LQ;OPZ;[(\bcdddeeee6q)))fQQQTlF111a4L()..z2|LLqqq!tfQQQTl2388RHH!&A,qqq!t"<=BB:rSTUUfQQQTlF111a4L89>>z2|\\	^**:r2>>
Iz1o1---I%aaaAAAg.qqq$z:AA*bRTUUW`agh
 
 
	 	MLLOOEJv|A::==fEEvNTUVVV&;888r*   r   c	           	         |                      |||          }	|'|                     ||| j        j                  \  }}
}n|                    d          }
|d}|	|                     t          j        |t          j        |	j	                            z   }	||                     t          j
        |
|t          j        |	j	                            z   }t          j        |	|gd          }t          j        ||
gd          }||fS )N)	input_idstoken_type_idsinputs_embeds)rm   r   dtyper_   rJ   )r3   r   rB   rm   r{   r>   r$   
zeros_likers   r_   	full_likery   )rC   r   attention_maskr   r   r   r   image_embedsimage_token_type_idxtext_embedsimage_masksr   
embeddingsmaskss                 r+   forwardzViltEmbeddings.forward   s0    **m + 
 

 595F5Fj4;;W 6G 6 62L+{{ %,,Q//K  '#$ !D$>$>^5:kFXYYY%
 %
 
 $d&@&@OK)=UZXcXjkkk'
 '
 

 Y\:BBB
	>;7Q???5  r*   )rE   )r   )r    r!   r"   r#   r1   r   r   __classcell__rD   s   @r+   r-   r-   E   st             V9 V9 V9 V9B '! '! '! '! '! '! '! '!r*   r-   c                   *     e Zd ZdZ fdZddZ xZS )r2   zGConstruct the embeddings from word, position and token_type embeddings.c                    t                                                       t          j        |j        |j        |j                  | _        t          j        |j        |j                  | _	        t          j        |j
        |j                  | _        t          j        |j        |j                  | _        t          j        |j                  | _        t#          |dd          | _        |                     dt)          j        |j                                      d          d           |                     d	t)          j        | j                                        t(          j        
          d           d S )N)padding_idxepsposition_embedding_typeabsoluteposition_ids)r   r[   F)
persistentr   r   )r0   r1   r   r<   
vocab_sizer6   pad_token_idword_embeddingsmax_position_embeddingsr;   type_vocab_sizer>   	LayerNormlayer_norm_epsr?   r@   rA   getattrr   register_bufferr$   r}   r   r5   r   rI   rs   rC   rB   rD   s     r+   r1   zTextEmbeddings.__init__   sK   !|F,=v?Q_e_rsss#%<0NPVPb#c#c %'\&2H&J\%]%]" f&8f>STTTz&"<=='.v7PR\']']$EL)GHHOOPWXXej 	 	
 	
 	
 	ek$*;*@*@*B*B%*UUUbg 	 	
 	
 	
 	
 	
r*   Nc                    ||                                 }n|                                 d d         }|d         }|| j        d d d |f         }|mt          | d          r2| j        d d d |f         }|                    |d         |          }|}n+t          j        |t
          j        | j        j                  }|| 	                    |          }| 
                    |          }	||	z   }
| j        dk    r|                     |          }|
|z  }
|                     |
          }
|                     |
          }
|
S )Nr[   r   r   r   r   r   )rI   r   hasattrr   r   r$   r5   rs   r_   r   r>   r   r;   r   rA   )rC   r   r   r   r   input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr>   r   r;   s               r+   r   zTextEmbeddings.forward   sb    #..**KK',,..ss3K ^
,QQQ^<L
 !t-.. m*.*=aaa*n*M'3J3Q3QR]^_R`bl3m3m0!A!&[
SWSdSk!l!l!l  00;;M $ : :> J J"%::
':55"&":":<"H"H--J^^J//
\\*--
r*   )NNNNr    r!   r"   r#   r1   r   r   r   s   @r+   r2   r2      sR        QQ
 
 
 
 
&               r*   r2   c                   (     e Zd ZdZ fdZd Z xZS )r8   z#
    Image to Patch Embedding.
    c                    t                                                       |j        |j        }}|j        |j        }}t          |t          j        j	                  r|n||f}t          |t          j        j	                  r|n||f}|d         |d         z  |d         |d         z  z  }|| _        || _        || _        || _
        t          j        ||||          | _        d S )Nr   r   )kernel_sizestride)r0   r1   ru   rv   r   r6   r   collectionsabcIterabler:   r   Conv2dro   )rC   rB   ru   rv   r   r6   r:   rD   s          r+   r1   zViltPatchEmbeddings.__init__  s    !'!2F4EJ
$*$79Kk#-j+/:R#S#SqZZZdfpYq
#-j+/:R#S#SqZZZdfpYq
!!}
15*Q-:VW=:XY$$(&)L+:^hiiir*   c                     |j         \  }}}}|| j        k    rt          d          | j        j        j        }|                     |                    |                    }|S )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   )rq   r   
ValueErrorro   rp   r   r~   )rC   r   r   r   rV   rX   target_dtyper   s           r+   r   zViltPatchEmbeddings.forward,  si    2>2D/
L&%4,,,w   -3OOLOO,O??@@r*   r   r   s   @r+   r8   r8     sV         j j j j j      r*   r8   c                   &     e Zd Z fdZddZ xZS )ViltSelfAttentionc                    t                                                       |j        |j        z  dk    r0t	          |d          s t          d|j         d|j         d          |j        | _        t          |j        |j        z            | _        | j        | j        z  | _        t          j
        |j        | j        |j                  | _        t          j
        |j        | j        |j                  | _        t          j
        |j        | j        |j                  | _        t          j        |j                  | _        d S )Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .bias)r0   r1   r6   num_attention_headsr   r   r   attention_head_sizeall_head_sizer   Linearqkv_biasquerykeyvaluer?   attention_probs_dropout_probrA   r   s     r+   r1   zViltSelfAttention.__init__8  s.    ::a??PVXhHiHi?76#5 7 737 7 7  
 $*#= #&v'9F<V'V#W#W !58PPYv143EFO\\\
9V/1C&/ZZZYv143EFO\\\
z&"EFFr*   NFc                    |j         \  }}}|                     |                              |d| j        | j                                      dd          }|                     |                              |d| j        | j                                      dd          }	|                     |                              |d| j        | j                                      dd          }
t          j	        ||	                    dd                    }|t          j        | j                  z  }|||z   } t          j        d          |          }|                     |          }|||z  }t          j	        ||
          }|                    dddd                                          }|                                d d         | j        fz   } |j        | }|r||fn|f}|S )Nr[   r   rG   rZ   rJ   r   r   )rq   r   rx   r   r   rw   r   r   r$   matmulmathsqrtr   SoftmaxrA   permute
contiguousrI   r   )rC   r   r   	head_maskoutput_attentionsr   r   r   query_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputss                   r+   r   zViltSelfAttention.forwardJ  s   $1$7!
JJJ}%%T*b$":D<TUUYq!__ 	 HH]##T*b$":D<TUUYq!__ 	 JJ}%%T*b$":D<TUUYq!__ 	 !<Y5H5HR5P5PQQ+di8P.Q.QQ%/.@ -"*,,,-=>> ,,77  -	9O_kBB%--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S**,CD6G]=/22mM]r*   NNFr    r!   r"   r1   r   r   r   s   @r+   r   r   7  sQ        G G G G G$, , , , , , , ,r*   r   c                   Z     e Zd ZdZdef fdZdej        dej        dej        fdZ xZ	S )ViltSelfOutputz
    The residual connection is defined in ViltLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    rB   c                     t                                                       t          j        |j        |j                  | _        t          j        |j                  | _        d S N)	r0   r1   r   r   r6   denser?   r@   rA   r   s     r+   r1   zViltSelfOutput.__init__  sJ    Yv163EFF
z&"<==r*   r   input_tensorreturnc                 Z    |                      |          }|                     |          }|S r  r  rA   rC   r   r  s      r+   r   zViltSelfOutput.forward  s*    

=11]33r*   )
r    r!   r"   r#   r   r1   r$   Tensorr   r   r   s   @r+   r  r  z  s         
>z > > > > > >
U\  RWR^        r*   r  c                   ,     e Zd Z fdZd ZddZ xZS )ViltAttentionc                     t                                                       t          |          | _        t	          |          | _        t                      | _        d S r  )r0   r1   r   	attentionr  outputsetpruned_headsr   s     r+   r1   zViltAttention.__init__  sI    *622$V,,EEr*   c                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r   rJ   )lenr   r  r   r   r  r   r   r   r   r  r  r   union)rC   headsindexs      r+   prune_headszViltAttention.prune_heads  s   u::??F74>5t~7Y[_[l
 
u
  2$.2FNN/0BEJJ1$.2FNN.t{/@%QOOO .2^-ORUV[R\R\-\*'+~'IDNLn'n$ -33E::r*   NFc                     |                      ||||          }|                     |d         |          }|f|dd          z   }|S )Nr   r   )r  r  )rC   r   r   r   r   self_outputsattention_outputr   s           r+   r   zViltAttention.forward  sM    ~~m^YPabb;;|AFF#%QRR(88r*   r   )r    r!   r"   r1   r  r   r   r   s   @r+   r  r    s[        " " " " "; ; ;$       r*   r  c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )ViltIntermediaterB   c                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S r  )r0   r1   r   r   r6   intermediate_sizer  r   
hidden_actstrr	   intermediate_act_fnr   s     r+   r1   zViltIntermediate.__init__  sn    Yv163KLL
f'-- 	9'-f.?'@D$$$'-'8D$$$r*   r   r  c                 Z    |                      |          }|                     |          }|S r  )r  r#  rC   r   s     r+   r   zViltIntermediate.forward  s,    

=1100??r*   	r    r!   r"   r   r1   r$   r  r   r   r   s   @r+   r  r    sj        9z 9 9 9 9 9 9U\ el        r*   r  c                   V     e Zd Zdef fdZdej        dej        dej        fdZ xZS )
ViltOutputrB   c                     t                                                       t          j        |j        |j                  | _        t          j        |j                  | _	        d S r  )
r0   r1   r   r   r   r6   r  r?   r@   rA   r   s     r+   r1   zViltOutput.__init__  sJ    Yv79KLL
z&"<==r*   r   r  r  c                 d    |                      |          }|                     |          }||z   }|S r  r
  r  s      r+   r   zViltOutput.forward  s4    

=11]33%4r*   r&  r   s   @r+   r(  r(    su        >z > > > > > >
U\  RWR^        r*   r(  c                   *     e Zd ZdZ fdZddZ xZS )	ViltLayerz?This corresponds to the Block class in the timm implementation.c                 z   t                                                       |j        | _        d| _        t	          |          | _        t          |          | _        t          |          | _	        t          j        |j        |j                  | _        t          j        |j        |j                  | _        d S )Nr   r   )r0   r1   chunk_size_feed_forwardseq_len_dimr  r  r  intermediater(  r  r   r   r6   r   layernorm_beforelayernorm_afterr   s     r+   r1   zViltLayer.__init__  s    '-'E$&v..,V44 (( "V-?VEZ [ [ [!|F,>FDYZZZr*   NFc                 H   |                      |                     |          |||          }|d         }|dd          }||                    |j                  z   }|                     |          }|                     |          }|                     ||          }|f|z   }|S )N)r   r   r   )r  r1  r~   r_   r2  r0  r  )	rC   r   r   r   r   self_attention_outputsr  r   layer_outputs	            r+   r   zViltLayer.forward  s    !%!!-00/	 "0 "
 "
 2!4(, )=+;+;<L<S+T+TT ++M::((66 {{<??/G+r*   r   r   r   s   @r+   r,  r,    sW        II[ [ [ [ [       r*   r,  c                   0     e Zd Z fdZ	 	 	 	 	 ddZ xZS )ViltEncoderc                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S r)   )r,  )rS   r   rB   s     r+   rY   z(ViltEncoder.__init__.<locals>.<listcomp>  s!    #_#_#_!If$5$5#_#_#_r*   F)	r0   r1   rB   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   s    `r+   r1   zViltEncoder.__init__  s`    ]#_#_#_#_uVE]?^?^#_#_#_``
&+###r*   NFTc                 .   |rdnd }|rdnd }t          | j                  D ]=\  }	}
|r||fz   }|||	         nd } |
||||          }|d         }|r||d         fz   }>|r||fz   }|st          d |||fD                       S t          |||          S )Nr)   r   r   c              3      K   | ]}||V  	d S r  r)   ri   s     r+   	<genexpr>z&ViltEncoder.forward.<locals>.<genexpr>  s(      mmq_`_l_l_l_l_lmmr*   )last_hidden_stater   r   )r   r=  r(   r   )rC   r   r   r   r   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsr   layer_modulelayer_head_masklayer_outputss                r+   r   zViltEncoder.forward  s    #7@BBD$5?bb4(44 	P 	POA|# I$58H$H!.7.CillO(LYjkkM)!,M  P&9]1=M<O&O# 	E 1]4D D 	nmm]4EGZ$[mmmmmm++*
 
 
 	
r*   )NNFFTr  r   s   @r+   r7  r7    s]        , , , , , ""
 "
 "
 "
 "
 "
 "
 "
r*   r7  c                   0    e Zd ZU eed<   dZdZddgZd ZdS )ViltPreTrainedModelrB   viltTr-   r   c                    t          |t          j        t          j        f          rT|j        j                            d| j        j                   |j	         |j	        j        
                                 dS dS t          |t          j                  r_|j        j                            d| j        j                   |j        +|j        j        |j                 
                                 dS dS t          |t          j                  r?|j	        j        
                                 |j        j                            d           dS dS )zInitialize the weightsg        )meanstdNg      ?)r   r   r   r   rp   datanormal_rB   initializer_ranger   zero_r<   r   r   fill_)rC   modules     r+   _init_weightsz!ViltPreTrainedModel._init_weights$  s0   fry")455 	* M&&CT[5R&SSS{& &&((((( '&-- 	*M&&CT[5R&SSS!-"6#56<<>>>>> .--- 	*K""$$$M$$S)))))	* 	*r*   N)	r    r!   r"   r   r&   base_model_prefixsupports_gradient_checkpointing_no_split_modulesrV  r)   r*   r+   rK  rK    sI         &*#)+>?* * * * *r*   rK  c                       e Zd Zd fd	Zd Zd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 ddee	j
                 dee	j                 d	ee	j
                 d
ee	j                 dee	j
                 dee	j                 dee	j                 dee	j                 dee         dee         dee         dee         deeee	j                 f         fd            Z xZS )	ViltModelTc                 J   t                                          |           || _        t          |          | _        t          |          | _        t          j        |j	        |j
                  | _        |rt          |          nd| _        |                                  dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        r   N)r0   r1   rB   r-   r   r7  encoderr   r   r6   r   	layernorm
ViltPoolerpooler	post_init)rC   rB   add_pooling_layerrD   s      r+   r1   zViltModel.__init__7  s    
 	   (00"6**f&8f>STTT,=Gj(((4 	r*   c                 $    | j         j        j        S r  r   r3   r   rC   s    r+   get_input_embeddingszViltModel.get_input_embeddingsH  s    .>>r*   c                 (    || j         j        _        d S r  rd  )rC   r   s     r+   set_input_embeddingszViltModel.set_input_embeddingsK  s    :?'777r*   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr]  r=  r  r  )rC   heads_to_pruner=  r  s       r+   _prune_headszViltModel._prune_headsN  sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr*   Nr   r   r   r   r   r   r   r   r   r   rC  rD  r  c           
      \   |
|
n| j         j        }
||n| j         j        }||n| j         j        }||t	          d          |+|                     ||           |                                }n.||                                dd         }nt	          d          |\  }}||j        n|j        }|t          j	        ||f|          }||t	          d          ||t	          d          ||j
        d         n|j
        d         }||k    rt	          d	          |-t          j	        || j         j        | j         j        f|          }|                     || j         j                  }|                     ||||||||	
          \  }}|                     ||          }|                     ||||
||          }|d         }|                     |          }| j        |                     |          nd}|s||f|dd         z   S t'          |||j        |j                  S )ak  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        image_token_type_idx (`int`, *optional*):
            - The token type ids for images.

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltModel
        >>> from PIL import Image
        >>> import requests

        >>> # prepare image and text
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "hello world"

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")
        >>> model = ViltModel.from_pretrained("dandelin/vilt-b32-mlm")

        >>> inputs = processor(image, text, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> last_hidden_states = outputs.last_hidden_state
        ```NzDYou cannot specify both input_ids and inputs_embeds at the same timer[   z5You have to specify either input_ids or inputs_embedsr^   zFYou cannot specify both pixel_values and image_embeds at the same timez7You have to specify either pixel_values or image_embedsr   zAThe text inputs and image inputs need to have the same batch size)r   )r   r   r   rC  rD  r   )rB  pooler_outputr   r   )rB   r   rC  use_return_dictr   %warn_if_padding_and_no_attention_maskrI   r_   r$   r   rq   ru   get_head_maskr<  r   get_extended_attention_maskr]  r^  r`  r   r   r   )rC   r   r   r   r   r   r   r   r   r   r   rC  rD  r   text_batch_sizer   r_   image_batch_sizeembedding_outputextended_attention_maskencoder_outputssequence_outputpooled_outputs                          r+   r   zViltModel.forwardV  s   T 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B] ]%>cddd"66y.QQQ#..**KK&',,..ss3KKTUUU&1#%.%:!!@T!"Z/:)FPVWWWN#(@efff!l&:VWWW4@4L<-a00R^RdefRg..`aaa%5t{7Mt{Oe$fouvvvJ &&y$+2OPP	+/??!5 ,; 	,
 	,
(. 150P0PQ_al0m0m,,2/!5# ' 
 
 *!,..998<8OO444UY 	J#]3oabb6III)-')7&1	
 
 
 	
r*   )TNNNNNNNNNNNN)r    r!   r"   r1   rf  rh  rl  r   r   r$   
LongTensorr%   r   boolr   r   r(   r   r   r   s   @r+   r[  r[  5  s            "? ? ?@ @ @C C C  156:594815155948.2,0/3&*t
 t
E,-t
 !!23t
 !!12	t

 u01t
 U-.t
 E-.t
   12t
 u01t
 'smt
 $D>t
 'tnt
 d^t
 
)51B+CC	Dt
 t
 t
 ^t
 t
 t
 t
 t
r*   r[  c                   $     e Zd Z fdZd Z xZS )r_  c                     t                                                       t          j        |j        |j                  | _        t          j                    | _        d S r  )r0   r1   r   r   r6   r  Tanh
activationr   s     r+   r1   zViltPooler.__init__  sC    Yv163EFF
'))r*   c                 r    |d d df         }|                      |          }|                     |          }|S rb   )r  r  )rC   r   first_token_tensorry  s       r+   r   zViltPooler.forward  s@     +111a40

#56666r*   r  r   s   @r+   r_  r_    sG        $ $ $ $ $
      r*   r_  zU
    ViLT Model with a language modeling head on top as done during pretraining.
    c                       e Zd ZddgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 ddee	j
                 dee	j                 d	ee	j
                 d
ee	j                 dee	j
                 dee	j                 dee	j                 dee	j                 dee	j
                 dee         dee         dee         deeee	j                 f         fd            Z xZS )ViltForMaskedLMzmlm_score.decoder.weightzmlm_score.decoder.biasc                     t                                          |           t          |          | _        t	          |          | _        |                                  d S r  )r0   r1   r[  rL  ViltMLMHead	mlm_scorera  r   s     r+   r1   zViltForMaskedLM.__init__  sQ       f%%	$V,, 	r*   c                     | j         j        S r  )r  decoderre  s    r+   get_output_embeddingsz%ViltForMaskedLM.get_output_embeddings  s    ~%%r*   c                 @    || j         _        |j        | j         _        d S r  )r  r  r   )rC   new_embeddingss     r+   set_output_embeddingsz%ViltForMaskedLM.set_output_embeddings  s    !/,1r*   Nr   r   r   r   r   r   r   r   labelsr   rC  rD  r  c                 V   ||n| j         j        }|                     |||||||||
||          }|dd         \  }}||j        d         n|j        d         }|ddd|f         |dd|df         }}|                     |          }d}|	et                      }|	                    |j                  }	 ||                    d| j         j	                  |	                    d                    }|s|f|dd         z   }||f|z   n|S t          |||j        |j                  S )a/  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in *[-100, 0, ...,
            config.vocab_size]* (see *input_ids* docstring) Tokens with indices set to *-100* are ignored (masked), the
            loss is only computed for the tokens with labels in *[0, ..., config.vocab_size]*

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForMaskedLM
        >>> import requests
        >>> from PIL import Image
        >>> import re
        >>> import torch

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "a bunch of [MASK] laying on a [MASK]."

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")
        >>> model = ViltForMaskedLM.from_pretrained("dandelin/vilt-b32-mlm")

        >>> # prepare inputs
        >>> encoding = processor(image, text, return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(**encoding)

        >>> tl = len(re.findall("\[MASK\]", text))
        >>> inferred_token = [text]

        >>> # gradually fill in the MASK tokens, one by one
        >>> with torch.no_grad():
        ...     for i in range(tl):
        ...         encoded = processor.tokenizer(inferred_token)
        ...         input_ids = torch.tensor(encoded.input_ids)
        ...         encoded = encoded["input_ids"][0][1:-1]
        ...         outputs = model(input_ids=input_ids, pixel_values=encoding.pixel_values)
        ...         mlm_logits = outputs.logits[0]  # shape (seq_len, vocab_size)
        ...         # only take into account text features (minus CLS and SEP token)
        ...         mlm_logits = mlm_logits[1 : input_ids.shape[1] - 1, :]
        ...         mlm_values, mlm_ids = mlm_logits.softmax(dim=-1).max(dim=-1)
        ...         # only take into account text
        ...         mlm_values[torch.tensor(encoded) != 103] = 0
        ...         select = mlm_values.argmax().item()
        ...         encoded[select] = mlm_ids[select].item()
        ...         inferred_token = [processor.decode(encoded)]

        >>> selected_token = ""
        >>> encoded = processor.tokenizer(inferred_token)
        >>> output = processor.decode(encoded.input_ids[0], skip_special_tokens=True)
        >>> print(output)
        a bunch of cats laying on a couch.
        ```N
r   r   r   r   r   r   r   r   rC  rD  rG   r   r[   r   r   r   r   )rB   ro  rL  rq   r  r   r~   r_   rx   r   r   r   r   )rC   r   r   r   r   r   r   r   r   r  r   rC  rD  r   rx  ry  text_seq_lentext_featuresr   
mlm_logitsmasked_lm_lossloss_fctr  s                          r+   r   zViltForMaskedLM.forward  s   R &1%<kk$+B]))))%!'%/!5#  
 
 *1!&-6-Byq))H[\]H^+AAA}},<=qqqR^R_R_O_?`q^^M22
'))HYYz011F%Xjoob$+:P&Q&QSYS^S^_aSbSbccN 	Z ]WQRR[0F3A3M^%..SYY!/)	
 
 
 	
r*   rz  )r    r!   r"   _tied_weights_keysr1   r  r  r   r   r$   r{  r%   r|  r   r   r(   r   r   r   s   @r+   r  r    s        56NO    & & &2 2 2  156:594815155948-1,0/3&*o
 o
E,-o
 !!23o
 !!12	o

 u01o
 U-.o
 E-.o
   12o
 u01o
 )*o
 $D>o
 'tno
 d^o
 
~uU%677	8o
 o
 o
 ^o
 o
 o
 o
 o
r*   r  c                   $     e Zd Z fdZd Z xZS )ViltPredictionHeadTransformc                 V   t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _
        n|j        | _
        t          j        |j        |j                  | _        d S )Nr   )r0   r1   r   r   r6   r  r   r!  r"  r	   transform_act_fnr   r   r   s     r+   r1   z$ViltPredictionHeadTransform.__init__i  s    Yv163EFF
f'-- 	6$*6+<$=D!!$*$5D!f&8f>STTTr*   c                     |                      |          }|                     |          }|                     |          }|S r  )r  r  r   r%  s     r+   r   z#ViltPredictionHeadTransform.forwardr  s=    

=11--m<<}55r*   r  r   s   @r+   r  r  h  sL        U U U U U      r*   r  c                   ,     e Zd Zd fd	Zd Zd Z xZS )r  Nc                 h   t                                                       || _        t          |          | _        t          j        |j        |j        d          | _	        t          j
        t          j        |j                            | _        ||| j	        _        | j        | j	        _        d S )NFr   )r0   r1   rB   r  	transformr   r   r6   r   r  r4   r$   r5   r   rp   )rC   rB   rp   rD   s      r+   r1   zViltMLMHead.__init__z  s    4V<<y!3V5FUSSSLV->!?!?@@	"(DL !Ir*   c                 (    | j         | j        _         d S r  )r   r  re  s    r+   _tie_weightszViltMLMHead._tie_weights  s     Ir*   c                 Z    |                      |          }|                     |          }|S r  )r  r  )rC   r   s     r+   r   zViltMLMHead.forward  s'    NN1LLOOr*   r  )r    r!   r"   r1   r  r   r   r   s   @r+   r  r  y  s[        
& 
& 
& 
& 
& 
&& & &      r*   r  z
    Vilt Model transformer with a classifier head on top (a linear layer on top of the final hidden state of the [CLS]
    token) for visual question answering, e.g. for VQAv2.
    c                       e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
eej                 deej                 dee	         dee	         dee	         de
eeej                 f         fd            Z xZS )ViltForQuestionAnsweringc           	         t                                          |           |j        | _        t          |          | _        t          j        t          j        |j        |j        dz            t          j	        |j        dz            t          j
                    t          j        |j        dz  |j                            | _        |                                  d S )NrG   )r0   r1   
num_labelsr[  rL  r   
Sequentialr   r6   r   GELU
classifierra  r   s     r+   r1   z!ViltForQuestionAnswering.__init__  s        +f%%	 -If(&*<q*@AAL+a/00GIIIf(1,f.?@@	
 
 	r*   Nr   r   r   r   r   r   r   r   r  r   rC  rD  r  c                    ||n| j         j        }|                     |||||||||
||          }|r|j        n|d         }|                     |          }d}|	H|	                    |j                  }	t          j        	                    ||	          |	j
        d         z  }|s|f|dd         z   }||f|z   n|S t          |||j        |j                  S )aX  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (`torch.FloatTensor` of shape `(batch_size, num_labels)`, *optional*):
            Labels for computing the visual question answering loss. This tensor must be either a one-hot encoding of
            all answers that are applicable for a given example in the batch, or a soft encoding indicating which
            answers are applicable, where 1.0 is the highest score.

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForQuestionAnswering
        >>> import requests
        >>> from PIL import Image

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "How many cats are there?"

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
        >>> model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

        >>> # prepare inputs
        >>> encoding = processor(image, text, return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(**encoding)
        >>> logits = outputs.logits
        >>> idx = logits.argmax(-1).item()
        >>> print("Predicted answer:", model.config.id2label[idx])
        Predicted answer: 2
        ```Nr  r   rG   r  )rB   ro  rL  rn  r  r~   r_   r   rP    binary_cross_entropy_with_logitsrq   r   r   r   )rC   r   r   r   r   r   r   r   r   r  r   rC  rD  r   rn  r   r   r  s                     r+   r   z ViltForQuestionAnswering.forward  s   b &1%<kk$+B]))))%!'%/!5#  
 
 2=L--'!*//YYv}--F=AA&&QQTZT`abTccD  	FY,F)-)9TGf$$vE'!/)	
 
 
 	
r*   rz  r    r!   r"   r1   r   r   r$   r{  r%   r|  r   r   r(   r   r   r   s   @r+   r  r    s           "  156:594815155948-1,0/3&*T
 T
E,-T
 !!23T
 !!12	T

 u01T
 U-.T
 E-.T
   12T
 u01T
 )*T
 $D>T
 'tnT
 d^T
 
'u/@)AA	BT
 T
 T
 ^T
 T
 T
 T
 T
r*   r  z
    Vilt Model transformer with a classifier head on top (a linear layer on top of the final hidden state of the [CLS]
    token) for image-to-text or text-to-image retrieval, e.g. MSCOCO and F30K.
    c                       e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
eej                 deej                 dee	         dee	         dee	         de
eeej                 f         fd            Z xZS )ViltForImageAndTextRetrievalc                     t                                          |           t          |          | _        t	          j        |j        d          | _        |                                  d S r/   )	r0   r1   r[  rL  r   r   r6   rank_outputra  r   s     r+   r1   z%ViltForImageAndTextRetrieval.__init__  s[       f%%	 9V%7;; 	r*   Nr   r   r   r   r   r   r   r   r  r   rC  rD  r  c                 B   ||n| j         j        }d}|	t          d          |                     |||||||||
||          }|r|j        n|d         }|                     |          }|s|f|dd         z   }||f|z   n|S t          |||j        |j                  S )ad  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels are currently not supported.

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForImageAndTextRetrieval
        >>> import requests
        >>> from PIL import Image

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> texts = ["An image of two cats chilling on a couch", "A football player scoring a goal"]

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-coco")
        >>> model = ViltForImageAndTextRetrieval.from_pretrained("dandelin/vilt-b32-finetuned-coco")

        >>> # forward pass
        >>> scores = dict()
        >>> for text in texts:
        ...     # prepare inputs
        ...     encoding = processor(image, text, return_tensors="pt")
        ...     outputs = model(**encoding)
        ...     scores[text] = outputs.logits[0, :].item()
        ```NzTraining is not yet supported.r  r   rG   r  )	rB   ro  NotImplementedErrorrL  rn  r  r   r   r   )rC   r   r   r   r   r   r   r   r   r  r   rC  rD  r   r   rn  r   r  s                     r+   r   z$ViltForImageAndTextRetrieval.forward  s    Z &1%<kk$+B]%&FGGG))))%!'%/!5#  
 
 2=L--'!*!!-00 	FY,F)-)9TGf$$vE'!/)	
 
 
 	
r*   rz  r  r   s   @r+   r  r    s       	 	 	 	 	  156:594815155948-1,0/3&*M
 M
E,-M
 !!23M
 !!12	M

 u01M
 U-.M
 E-.M
   12M
 u01M
 )*M
 $D>M
 'tnM
 d^M
 
'u/@)AA	BM
 M
 M
 ^M
 M
 M
 M
 M
r*   r  zq
    Vilt Model transformer with a classifier head on top for natural language visual reasoning, e.g. NLVR2.
    c                       e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
eej                 deej                 dee	         dee	         dee	         de
eeej                 f         fd            Z xZS )"ViltForImagesAndTextClassificationc           	         t                                          |           |j        | _        t          |          | _        |j        }t          j        t          j        |j	        |z  |j	        |z            t          j
        |j	        |z            t          j                    t          j        |j	        |z  |j                            | _        |                                  d S r  )r0   r1   r  r[  rL  
num_imagesr   r  r   r6   r   r  r  ra  )rC   rB   r  rD   s      r+   r1   z+ViltForImagesAndTextClassification.__init__h  s        +f%%	 &
-If(:5v7IJ7VWWL+j899GIIIf(:5v7HII	
 
 	r*   Nr   r   r   r   r   r   r   r   r  r   rC  rD  r  c                    |
|
n| j         j        }
||n| j         j        }||n| j         j        }| |j        dk    r|                    d          }| |j        dk    r|                    d          }||j        d         nd}|||j        d         nd}|| j         j        k    rt          d          g }|rg nd}|
rg nd}t          |          D ]}| 
                    |||||dd|ddddddf         nd||dd|ddddf         nd||||dd|ddddf         nd|dz   |
||          }|r|j        n|d         }|                    |           |r|                    |j                   |
r|                    |j                   t          j        |d          }|                     |          }d}|	`t%                      }|	                    |j                  }	 ||                    d| j                  |	                    d                    }|s|||f}||f|z   n|S t/          ||||	          S )
a3  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Binary classification labels.

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForImagesAndTextClassification
        >>> import requests
        >>> from PIL import Image

        >>> image1 = Image.open(requests.get("https://lil.nlp.cornell.edu/nlvr/exs/ex0_0.jpg", stream=True).raw)
        >>> image2 = Image.open(requests.get("https://lil.nlp.cornell.edu/nlvr/exs/ex0_1.jpg", stream=True).raw)
        >>> text = "The left image contains twice the number of dogs as the right image."

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-nlvr2")
        >>> model = ViltForImagesAndTextClassification.from_pretrained("dandelin/vilt-b32-finetuned-nlvr2")

        >>> # prepare inputs
        >>> encoding = processor([image1, image2], text, return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(input_ids=encoding.input_ids, pixel_values=encoding.pixel_values.unsqueeze(0))
        >>> logits = outputs.logits
        >>> idx = logits.argmax(-1).item()
        >>> print("Predicted answer:", model.config.id2label[idx])
        Predicted answer: True
        ```N   r   r   z\Make sure to match the number of images in the model with the number of images in the input.)r   r   r   r   r   r   r   r   r   rC  rD  r[   rJ   r  )rB   r   rC  ro  ndim	unsqueezerq   r  r   r;  rL  rn  r   r   r   r$   ry   r  r   r~   r_   rx   r  r   )rC   r   r   r   r   r   r   r   r   r  r   rC  rD  r  pooler_outputsr   r   r   r   rn  ry  r   r   r  r  s                            r+   r   z*ViltForImagesAndTextClassification.forwardz  s   ^ 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]#(9Q(>(>'11!44L#(9Q(>(>'11!44L.:.F\'**D
2>2J+A..PTJ///n   2<,6RR$
z"" 	6 	6Aii--<H<T\!!!Q111aaa-88Z^5?5K:aaaAAAqqqj11QU#+9E9Q\!!!Q111*55W[%&U"3%9'    G 6APG11gajM!!-000# <$$W%:;;;  6!!'"4555	.b999//'))HYYv}--F8FKKDO<<fkk"ooNND 	FmZ8F)-)9TGf$$vE7'!	
 
 
 	
r*   rz  )r    r!   r"   r1   r   r   r$   r{  r%   r|  r   r   r(   r   r   r   s   @r+   r  r  b  s           $  156:594815155948-1,0/3&*p
 p
E,-p
 !!23p
 !!12	p

 u01p
 U-.p
 E-.p
   12p
 u01p
 )*p
 $D>p
 'tnp
 d^p
 
7u?P9QQ	Rp
 p
 p
 ^p
 p
 p
 p
 p
r*   r  c                       e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
eej                 deej                 dee	         dee	         dee	         de
eeej                 f         fd            Z xZS )ViltForTokenClassificationc                 :   t                                          |           |j        | _        t          |d          | _        t          j        |j                  | _        t          j	        |j
        |j                  | _        |                                  d S )NF)rb  )r0   r1   r  r[  rL  r   r?   r@   rA   r   r6   r  ra  r   s     r+   r1   z#ViltForTokenClassification.__init__  s~        +f>>>	z&"<==)F$68IJJ 	r*   Nr   r   r   r   r   r   r   r   r  r   rC  rD  r  c                 L   ||n| j         j        }|                     |||||||||
||          }|d         }||j        d         n|j        d         }|                     |          }|                     |ddd|f                   }d}|	`t                      }|	                    |j                  }	 ||	                    d| j
                  |		                    d                    }|s|f|dd         z   }||f|z   n|S t          |||j        |j                  S )a/  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   r   r[   rG   r  )rB   ro  rL  rq   rA   r  r   r~   r_   rx   r  r   r   r   )rC   r   r   r   r   r   r   r   r   r  r   rC  rD  r   rx  text_input_sizer   r   r  r  s                       r+   r   z"ViltForTokenClassification.forward  sa   0 &1%<kk$+B]))))%!'%/!5#  
 
 "!*090E)/!,,=K^_`Ka,,774D_4D1D!EFF'))HYYv}--F8FKKDO<<fkk"ooNND 	FY,F)-)9TGf$$vE$!/)	
 
 
 	
r*   rz  )r    r!   r"   r1   r   r   r$   r{  r%   r|  r   r   r(   r   r   r   s   @r+   r  r    sj       
 
 
 
 
  156:594815155948-1,0/3&*>
 >
E,->
 !!23>
 !!12	>

 u01>
 U-.>
 E-.>
   12>
 u01>
 )*>
 $D>>
 'tn>
 d^>
 
$eE,=&>>	?>
 >
 >
 ^>
 >
 >
 >
 >
r*   r  )r  r  r  r  r  r,  r[  rK  )=r#   collections.abcr   r   dataclassesr   typingr   r   r$   r   torch.nnr   activationsr	   modeling_layersr
   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   configuration_viltr   
get_loggerr    loggerr   Moduler-   r2   r8   r   r  r  r  r(  r,  r7  rK  r[  r_  r  r  r  r  r  r  r  __all__r)   r*   r+   <module>r     s6          ! ! ! ! ! ! " " " " " " " "        % % % % % % ! ! ! ! ! ! 9 9 9 9 9 9                . - - - - - [ [ [ [ [ [ [ [ [ [ , , , , , , , , * * * * * * 
	H	%	%   
@ @ @ @ @{ @ @  @$W! W! W! W! W!RY W! W! W!t6 6 6 6 6RY 6 6 6r    ")   >? ? ? ? ?	 ? ? ?F    RY   "    BI   F    ry    
 
 
 
 
 
 
 
# # # # #* # # #L)
 )
 )
 )
 )
") )
 )
 )
X * * * * */ * * *. U
 U
 U
 U
 U
# U
 U
 U
p          
C
 C
 C
 C
 C
) C
 C
 
C
L    ")   "    ")   ,   g
 g
 g
 g
 g
2 g
 g
 g
T   Z
 Z
 Z
 Z
 Z
#6 Z
 Z
 Z
z   
D
 D
 D
 D
 D
)< D
 D
 
D
N L
 L
 L
 L
 L
!4 L
 L
 L
^	 	 	r*   