
     `i`                        d Z ddlZddlmZ ddlmZ ddlZddlmZ ddlm	Z	 ddl
mZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZ  ej        e          Zee G d de                                  Z G d dej                  Z  G d dej                  Z! G d dej                  Z" G d dej                  Z# G d dej                  Z$ G d dej                  Z% G d dej                  Z& G d  d!e          Z' G d" d#ej                  Z( G d$ d%ej                  Z)e G d& d'e                      Z* G d( d)ej                  Z+ G d* d+ej                  Z,e+e,d,Z- ed-.           G d/ d0e*                      Z. G d1 d2ej                  Z/ ed3.           G d4 d5e*                      Z0g d6Z1dS )7zPyTorch TVP Model    N)	dataclass)Optional)nn   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingModelOutput)PreTrainedModel)prune_linear_layer)auto_docstringlogging)load_backbone   )	TvpConfigc                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eeej        df                  ed<   dZeeej        df                  ed<   dS )TvpVideoGroundingOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Temporal-Distance IoU loss for video grounding.
    logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
        Contains start_time/duration and end_time/duration. It is the time slot of the videos corresponding to the
        input texts.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.
    Nlosslogits.hidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   tupler        x/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/tvp/modeling_tvp.pyr   r   %   s         	 	 )-D(5$
%,,,*.FHU&'...=AM8E%"3S"89:AAA:>Ju0#567>>>>>r"   r   c                   :     e Zd ZdZ fdZd Zd Zd Zd Z xZ	S )TvpLossa~  
    This class computes the losses for `TvpForVideoGrounding`. The process happens in two steps: 1) we compute
    hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched
    ground-truth / prediction (supervise class and box).

    Args:
        losses (`list[str]`):
            List of all the losses to be applied.
    c                     t                                                       | j        | j        | j        d| _        |D ]}|| j        vrt          d| d          || _        d S )NioudistancedurationzLoss z not supported)super__init__loss_iouloss_distanceloss_durationloss_map
ValueErrorlosses)selfr2   r   	__class__s      r#   r,   zTvpLoss.__init__D   s    =**
 

  	? 	?D4=(( !=!=!=!=>>> ) r"   c                     t          j        ||          t          j        ||          z
  }t          j        ||          t          j        ||          z
  }d|                    d          |z  z
  }|S )z6
        Measure the intersection over union.
        r   r   min)r   r7   maxclamp)	r3   
start_timeend_timecandidates_start_timecandidates_end_timer*   interunionr(   s	            r#   r-   zTvpLoss.loss_iouQ   sp     	-x8859EZ\f;g;gg	-x8859EZ\f;g;gg%++!+$$u,,
r"   c                 J   t          j        t          j        ||          d          }t          j        t          j        ||          d          }t          j        t          j        ||          t          j        ||          z
  |                              d          }|S )z5
        Measure the distance of mid points.
        g       @g?r6   )r   divaddr8   r7   r9   )	r3   r:   r;   r<   r=   r*   mid_candidatesmid_groundtruthdistance_diffs	            r#   r.   zTvpLoss.loss_distance[   s     59-BDW#X#XZ]^^)EIj($C$CSII	Ino66>Sb9c9ccem
 

%C%.. 	 r"   c                     t          j        ||          }t          j        ||          }t          j        t          j        t          j        ||          |                    }|                    d          }|S )z5
        Measure the difference of duration.
        g?r6   )r   subsquarerA   r9   )	r3   r:   r;   r<   r=   r*   duration_candidatesduration_groundtruthduration_diffs	            r#   r/   zTvpLoss.loss_durationg   sp     $i(;=RSS$y:>>UYuy9LNb/c/cem%n%noo%+++44r"   c                 *   |\  }}}t          j        ||          }|dddf                                         |dddf                                         }}i }	| j        D ]1}
|	                    |
 | j        |
         |||||          i           2|	S )am  
        This performs the loss computation.

        Args:
            logits (`torch.FloatTensor`):
                The output logits of head module.
            labels (`list[torch.FloatTensor]`):
                List of tensors ([start, end, duration]), which contains start time, end time of the video corresponding to the text, and also the duration.
        Nr   r   )r   mulfloatr2   updater0   )r3   r   labelsr*   r:   r;   
candidatesr<   r=   losses_dictr   s              r#   forwardzTvpLoss.forwardr   s     *0&*hYvx00
5?15E5K5K5M5MzZ[Z[Z[]^Z^O_OeOeOgOg2K 	 	D*t}T*:xAVXkmuvvw    r"   )
r   r   r   r   r,   r-   r.   r/   rS   __classcell__r4   s   @r#   r%   r%   9   s~               
 
 
	 	 	      r"   r%   c                   $     e Zd Z fdZd Z xZS )TvpVisionModelc           	         t                                                       t          |          | _        |j        |j        j        d         }nt          | j        d          r2t          | j        j        d          r| j        j        j        d         }nPt          | j        d          r,t          | j        j        d          r| j        j        j        }nt          d          t          j        ||j        ddddd	          | _        d S )
Nconfighidden_sizeshidden_sizezBackbone config not foundr   r   F)kernel_sizestridepaddinggroupsbias)r+   r,   r   backbonebackbone_configr[   hasattrrZ   r\   r1   r   Conv2dgrid_encoder_conv)r3   rZ   in_channelsr4   s      r#   r,   zTvpVisionModel.__init__   s   %f--!- 0=bAKKT]H-- 	:'$-:NP^2_2_ 	:-.;B?KKT]H-- 	:'$-:NP]2^2^ 	:-.:KK8999!#"
 "
 "
r"   c                    |j         \  }}}}}|                    ||z  |||          }|                     |          d         d         }|                     |          }t          j                            |dd          }t          j                            |d          }|j         dd          \  }	}
}|                    |||	|
|          }|                    ddd	d
d          }|S )Nfeature_mapsr      )r]   r^   T)inplacer   r      )	shapeviewrb   rf   r   
functional
max_pool2drelupermute)r3   pixel_values
batch_size
num_framesnum_channelsheightwidthgrid_feat_outputsgridnew_channel
new_height	new_widths               r#   rS   zTvpVisionModel.forward   s    >J>P;
Jfe#((j)@,PVX]^^ MM,77GJ%%&788}''!A'FF}!!$!55-1Z_*ZyyZj)TT||Aq!Q**r"   r   r   r   r,   rS   rT   rU   s   @r#   rW   rW      sG        
 
 
 
 
.      r"   rW   c                   j     e Zd ZdZ fdZdej        dededej        fdZdd	e	fd
Z
dd	e	fdZ xZS )TvpVisualInputEmbeddingz;
    Takes input of both image and video (multi-frame)
    c                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j	        |j                  | _
        t          j        d|j                  | _        t          j        |j        |j                  | _        t          j        |j                  | _        |j        | _        |j	        | _	        d S )Nr   eps)r+   r,   r   	Embeddingmax_position_embeddingsr\   position_embeddings max_grid_row_position_embeddingsrow_position_embeddings max_grid_col_position_embeddingscol_position_embeddingstoken_type_embeddings	LayerNormlayer_norm_eps
layer_normDropouthidden_dropout_probdropoutr3   rZ   r4   s     r#   r,   z TvpVisualInputEmbedding.__init__   s    #%<0NPVPb#c#c ')|F4[]c]o'p'p$')|F4[]c]o'p'p$%'\!V5G%H%H",v'9v?TUUUz&"<==060W-060W---r"   	embeddingrx   ry   returnc                    dx}}|| j         k    r
|| j         z  }|| j        k    r
|| j        z  }|                    dddd          }t          j                            |||fdd          }|                    dddd          }|S )z
        This method allows to interpolate the pre-trained pad weights , to be able to use the model on collection of high
        resolution images (high resolution videos).

        r   r   r   rj   bicubicFscale_factormodealign_corners)r   r   rs   r   rp   interpolate)r3   r   rx   ry   h0w0s         r#   interpolate_pos_encodingz0TvpVisualInputEmbedding.interpolate_pos_encoding   s     RD999$??B4888>>B%%aAq11	M--b	 . 
 
	 %%aAq11	r"   Fr   c                 L   |j         \  }}}}t          | j        |          }t          j        |t          j        |j                  }|                     |          }	dt          |j                   dz
  z  |d|fz   }
 |	j	        |
 }	t          | j
        |          }t          j        |t          j        |j                  }|                     |          }|d||f} |j	        | }|	|z   }|r1|| j        k    s|| j
        k    r||                     |||          z   }n||z   }|S )af  
        Args:
            grid: (batch_size, height, width, hidden_dim)
            interpolate_pos_encoding: (`bool`, *optional*, defaults to `False`):
                Whether to interpolate the pre-trained position encodings.
        Returns:
            grid + col_position_embeddings.view(*col_shape): (batch_size, *, height, width, hidden_dim)
        dtypedevice)r   r   r   )rn   r7   r   r   arangelongr   r   lenro   r   r   r   )r3   r{   r   ru   rx   ry   
hidden_dim
row_heightrow_position_idsr   	row_shape	row_widthcol_position_idsr   	col_shapepositional_embeddingss                   r#   add_2d_positional_embeddingsz4TvpVisualInputEmbedding.add_2d_positional_embeddings   sP    15
-
FE: >GG
 <
%*T[YYY"&">">?O"P"PC
OOa/0J:3NN	">"9">	"J =uEE	 <	DKXXX"&">">?O"P"PIz:	">"9">	"J 7:Q Q $ 	0T:::edFk>k>k$778MvW\]]]DD//Dr"   c                    |j         \  }}}}}|                    d          }|                     ||          }|                    |d|          }|j         dd         }	|j        }
t          j        |	t
          j        |
          }|                     |          }||z   }| 	                    |          }| 
                    |          }|S )a  
        Args:
            grid: Array of shape (batch_size, num_frames, height, width, num_channels).
                It contains processed frames extracted from videos, and is generated by Tvp image preprocessor. Note,
                num_frames can be 1
            interpolate_pos_encoding: (bool, *optional*, defaults to `False`):
                Whether to interpolate the pre-trained position encodings.

        Returns:
            embeddings: The embedding of grid with size (batch_size, height*width, num_channels)

        r   r   rY   Nr   )rn   meanr   ro   r   r   zerosr   r   r   r   )r3   r{   r   ru   rv   rx   ry   rw   visual_tokensvisual_tokens_shaper   token_type_idsr   
embeddingss                 r#   rS   zTvpVisualInputEmbedding.forward  s     ?Cj;
J|yy||00Ph0ii		*b,??+1#2#6% %8
SYZZZ $ : :> J J"%::
__Z00
\\*--
r"   F)r   r   r   r   r,   r   Tensorintr   boolr   rS   rT   rU   s   @r#   r   r      s         
X 
X 
X 
X 
X%,  TW \a\h    .' '4 ' ' ' 'R d        r"   r   c                   *     e Zd ZdZ fdZddZ xZS )TvpTextInputEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    t                                                       t          j        |j        |j        |j                  | _        t          j        |j        |j                  | _	        t          j        |j
        |j                  | _        t          j        |j        |j                  | _        t          j        |j                  | _        d S )N)padding_idxr   )r+   r,   r   r   
vocab_sizer\   pad_token_idword_embeddingsr   r   type_vocab_sizer   r   r   r   r   r   r   r   s     r#   r,   zTvpTextInputEmbeddings.__init__%  s    !|F,=v?Q_e_rsss#%<0NPVPb#c#c %'\&2H&J\%]%]",v'9v?TUUUz&"<==r"   Nc                 ^   ||                                 }n|                                 d d         }|d         }||j        n|j        }|It          j        |t          j        |          }|                    d                              |          }|!t          j        |t          j        |          }||                     |          }| 	                    |          }| 
                    |          }	||z   |	z   }
|                     |
          }
|                     |
          }
|
S )NrY   r   r   r   )sizer   r   r   r   	unsqueezeexpandr   r   r   r   r   r   )r3   	input_idsr   position_idsinputs_embedsinput_shape
seq_lengthr   r   r   r   s              r#   rS   zTvpTextInputEmbeddings.forward-  s(    #..**KK',,..ss3K ^
%.%:!!@T <
%*VTTTL'11!44;;KHHL!"[EJvVVVN  00;;M"66|DD $ : :> J J"%88;PP
__Z00
\\*--
r"   )NNNNr   r   r   r   r,   rS   rT   rU   s   @r#   r   r   "  sR        QQ> > > > >       r"   r   c                   b     e Zd Z fdZd Zdej        dedefdZ	 	 	 d
de	e
         fd	Z xZS )TvpAttentionc                 V   t                                                       |j        |j        z  dk    r/t	          |d          st          d|j         d|j                   |j        | _        t          |j        |j        z            | _        | j        | j        z  | _        t          j
        |j        | j                  | _        t          j
        |j        | j                  | _        t          j
        |j        | j                  | _        t          j        |j                  | _        t          j
        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j                  | _        t/                      | _        d S )Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads r   )r+   r,   r\   num_attention_headsrd   r1   r   attention_head_sizeall_head_sizer   Linearquerykeyvaluer   attention_probs_dropout_probattn_dropoutdenser   r   r   r   r   setpruned_headsr   s     r#   r,   zTvpAttention.__init__G  su    ::a??PVXhHiHi? H6#5  H  Hkq  lF  H  H   $*#= #&v'9F<V'V#W#W !58PPYv143EFF
9V/1CDDYv143EFF
Jv'JKKYv163EFF
,v'9v?TUUUz&"<==EEr"   c                 P   t          |          dk    rd S t          j        | j        | j                  }t          |          | j        z
  }|D ]*t          fd| j        D                       z
  d|<   +|                    d          	                                
                    d          }t          j        t          |                    |                                         }t          | j        |          | _        t          | j        |          | _        t          | j        |          | _        t          | j        |d          | _        | j        t          |          z
  | _        | j        | j        z  | _        | j                            |          | _        d S )Nr   c              3   ,   K   | ]}|k     rd ndV  dS )r   r   Nr!   ).0hheads     r#   	<genexpr>z+TvpAttention.prune_heads.<locals>.<genexpr>c  s/      NNq1t88aaNNNNNNr"   rY   r   dim)r   r   onesr   r   r   r   sumro   
contiguouseqr   r   r   r   r   r   r   r   r?   )r3   headsmaskindexr   s       @r#   prune_headszTvpAttention.prune_heads\  sq   u::??Fz$2D4LMME

T.. 	 	D#NNNND<MNNNNNNDDJJyy}}'')),,Q//SYY''-2244 (
E::
%dh66'
E::
'
EqAAA
 $(#;c%jj#H !58PP -33E::r"   tensorsequence_lengthru   c                     |                     ||| j        | j                                      dd                                          S )Nr   rj   )ro   r   r   	transposer   )r3   r   r   ru   s       r#   _reshapezTvpAttention._reshapes  s8    KK
OT5MtOghhYq!__Z\\	
r"   Noutput_attentionsc                    |j         d d         \  }}|                     |          }|                     |          }|                     |          }	|                     |||          }
|                     |||          }|                     |	||          }t          j        |
|                    dd                    }|t          j	        | j
                  z  }|||z   }t          j                            |d          }|                     |          }|||z  }t          j        ||          }|                    dd                                          }|                    ||| j                  }|                     |          }|                     |          }|                     ||z             }|r||fn|f}|S )Nrj   rY   r   r   )rn   r   r   r   r   r   matmulr   mathsqrtr   r   rp   softmaxr   r   reshaper   r   r   r   )r3   r   attention_mask	head_maskr   ru   r   mixed_query_layermixed_key_layermixed_value_layerquery_layer	key_layervalue_layerattention_scoresattention_probsattn_outputoutputss                    r#   rS   zTvpAttention.forwardz  s    '4&9"1"&=#
O JJ}55((=11 JJ}55mm$5
SSMM/?JOO	mm$5
SS !<Y5H5HR5P5PQQ+di8P.Q.QQ%/.@ -//0@b/II ++O<<  -	9Ol?K@@!++Aq11<<>>!))*otGYZZjj--ll;//ookM&ABB4EY;00K>r"   NNN)r   r   r   r,   r   r   r   r   r   r   r   rS   rT   rU   s   @r#   r   r   F  s        " " " " "*; ; ;.
u| 
c 
s 
 
 
 
 ,0+ +
 $D>+ + + + + + + +r"   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )TvpIntermediatec                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S N)r+   r,   r   r   r\   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnr   s     r#   r,   zTvpIntermediate.__init__  sn    Yv163KLL
f'-- 	9'-f.?'@D$$$'-'8D$$$r"   r   r   c                 Z    |                      |          }|                     |          }|S r  )r   r  )r3   r   s     r#   rS   zTvpIntermediate.forward  s,    

=1100??r"   r   r   r   r,   r   r   rS   rT   rU   s   @r#   r	  r	    s^        9 9 9 9 9U\ el        r"   r	  c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )TvpOutputLayerc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _	        t          j
        |j                  | _        d S )Nr   )r+   r,   r   r   r  r\   r   r   r   r   r   r   r   r   s     r#   r,   zTvpOutputLayer.__init__  sf    Yv79KLL
,v'9v?TUUUz&"<==r"   r   input_tensorr   c                     |                      |          }|                     |          }|                     ||z             }|S r  )r   r   r   )r3   r   r  s      r#   rS   zTvpOutputLayer.forward  s@    

=11]33(DEEr"   r  rU   s   @r#   r  r    si        > > > > >U\  RWR^        r"   r  c                   >     e Zd Z fdZ	 	 	 ddee         fdZ xZS )TvpEncodeLayerc                     t                                                       t          |          | _        t	          |          | _        t          |          | _        d S r  )r+   r,   r   	attentionr	  intermediater  outputr   s     r#   r,   zTvpEncodeLayer.__init__  sK    %f--+F33$V,,r"   Nr   c                     |                      ||||          }|d         }|dd          }|                     |          }|                     ||          }	|	f|z   }|S )N)r   r   r   )r  r  r  )
r3   r   r   r   r   self_attention_outputsattention_outputr  intermediate_outputlayer_outputs
             r#   rS   zTvpEncodeLayer.forward  s     "&/	 "0 "
 "
 2!4(,"//0@AA{{#68HII/G+r"   r  )r   r   r   r,   r   r   rS   rT   rU   s   @r#   r  r    sj        - - - - - ,0 
 $D>       r"   r  c            
       |     e Zd Z fdZ	 	 	 	 	 ddeej                 dee         dee         dee         fdZ xZ	S )	
TvpEncoderc                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S r!   )r  )r   _rZ   s     r#   
<listcomp>z'TvpEncoder.__init__.<locals>.<listcomp>  s!    #d#d#dqN6$:$:#d#d#dr"   F)	r+   r,   rZ   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   s    `r#   r,   zTvpEncoder.__init__  s`    ]#d#d#d#dE&JbDcDc#d#d#dee
&+###r"   Nr   r   output_hidden_statesreturn_dictc                 t   ||n| j         j        }||n| j         j        }||n| j         j        }d}d}t	          | j                  D ]7\  }	}
|r||fz   } |
||||	         |          }|d         }|r||d         fz   }8|r||fz   }|s|f}|r||fz   }|r||fz   }|S t          ||r|nd |r|nd           S )Nr!   r   r   )last_hidden_stater   r   )rZ   r/  r   r.  	enumerater,  r	   )r3   r   r   r   r   r.  r/  all_hidden_statesall_attentionsilayer_modulelayer_outputsr  s                r#   rS   zTvpEncoder.forward  sX    &1%<kk$+BY1B1N--TXT_Tq$8$D  $+Jj 	 (44 	F 	FOA|# I$58H$H!(L	RSVghhM)!,M  F!/=3C2E!E   	E 1]4D D 	$&G# 9!%6$88  6!^$55N+/CM++):D~~
 
 
 	
r"   )NNNNN)
r   r   r   r,   r   r   r   r   rS   rT   rU   s   @r#   r$  r$    s        , , , , , 15,0/3&*+
 +
 E-.	+

 $D>+
 'tn+
 d^+
 +
 +
 +
 +
 +
 +
 +
r"   r$  c                   B     e Zd Z fdZdej        dej        fdZ xZS )	TvpPoolerc                     t                                                       t          j        |j        |j                  | _        t          j                    | _        d S r  )r+   r,   r   r   r\   r   Tanh
activationr   s     r#   r,   zTvpPooler.__init__  sC    Yv163EFF
'))r"   r   r   c                 r    |d d df         }|                      |          }|                     |          }|S )Nr   )r   r<  )r3   r   first_token_tensorpooled_outputs       r#   rS   zTvpPooler.forward  s@     +111a40

#56666r"   r  rU   s   @r#   r9  r9    s^        $ $ $ $ $
U\ el        r"   r9  c                   8    e Zd ZU eed<   dZdZdej        fdZ	dS )TvpPreTrainedModelrZ   modelTmodulec                    t          |t          j        t          j        f          r,|j        j                            d| j        j                   nt          |t          j	                  r>|j
        j                                         |j        j                            d           nt          |t          j                  rTt          j                            |j        dd           |j
        %t          j                            |j
        d           n9t          |t"                    r$t          j                            |j                   t          |t          j                  r%|j
        |j
        j                                         t'          |d	          r$t          j                            |j                   t'          |d
          r$t          j                            |j                   t'          |d          r$t          j                            |j                   t'          |d          r&t          j                            |j                   dS dS )zInitialize the weights        )r   stdg      ?fan_outrr   )r   nonlinearityNr   pad_uppad_downpad_left	pad_right)r  r   r   r   weightdatanormal_rZ   initializer_ranger   ra   zero_fill_re   initkaiming_normal_	constant_TvpModeltext_promptrd   rI  rJ  rK  rL  )r3   rC  s     r#   _init_weightsz TvpPreTrainedModel._init_weights-  s   fry",788 	0 M&&CT[5R&SSSS-- 	0K""$$$M$$S))))	** 	0G##FM	PV#WWW{&!!&+q111)) 	0GOOF.///fbi(( 	%V[-DK""$$$68$$ 	+GOOFM***6:&& 	-GOOFO,,,6:&& 	-GOOFO,,,6;'' 	.GOOF,-----	. 	.r"   N)
r   r   r   r   r   base_model_prefixsupports_gradient_checkpointingr   ModulerX  r!   r"   r#   rA  rA  '  sK         &*#.BI . . . . . .r"   rA  c                   (     e Zd ZdZ fdZd Z xZS )TvpFrameDownPadPrompterz>
    Pad frames extracted from videos only at the bottom.
    c           	      V   |j         dvrt          d          t                                                       |j        | _        |j        | _        |j        | _        |j         | _         t          j        t          j
        d|j        d|j        |j        g                    | _        d S )NrB   replaceremove9`visual_prompter_apply` must be in (add, replace, remove)r   r   )visual_prompter_applyr1   r+   r,   visual_prompt_size	frame_nummax_img_sizer   	Parameterr   randnrJ  r   s     r#   r,   z TvpFrameDownPadPrompter.__init__N  s    '/KKKXYYY"(";)"/%+%A"KF,a1JFL_`aa
 
r"   c                    | j         dk    rOt          j        | j        | j        g|j        |j                  }d|| j        | j        z
  | j        d d f<   ||z  }| j         dk    rt          j        |j        d         |j        d         d| j        | j        g|j                  }| j        | j        z
  }| j	        |d d d d d d || j        d d f<   ||
                    |j                  z  }|S )	NrB   r   rE  ra  r   r   r   r   )rc  r   r   rf  r   r   rd  r   rn   rJ  to)r3   rt   visual_prompt_maskpromptstart_points        r#   rS   zTvpFrameDownPadPrompter.forward\  s(   %..!&"D$56l>PYeYl" " " fit043JJTM^^`a`a`aab..L%11[#A&(:1(=q$BSUYUfg#*  F +d.EEKBF-F111aaaK$*;;QQQ>?FIIl&8999Lr"   r   rU   s   @r#   r]  r]  I  sQ         
 
 
 
 
      r"   r]  c                   \     e Zd ZdZ fdZdej        dededej        fdZdd	e	fd
Z
 xZS )TvpFramePadPrompterz?
    Pad frames extracted from videos in the surroundings.
    c           
         |j         dvrt          d          t                                                       |j        | _        |j        | _        |j         | _         |j        |j        dz  z
  | _        t          j	        t          j        d|j        d|j        |j        g                    | _        t          j	        t          j        d|j        d|j        |j        g                    | _        t          j	        t          j        d|j        d|j        |j        dz  z
  |j        g                    | _        t          j	        t          j        d|j        d|j        |j        dz  z
  |j        g                    | _        d S )Nr_  rb  rj   r   r   )rc  r1   r+   r,   rv   rf  rd  	base_sizer   rg  r   rh  rI  rJ  rK  rL  r   s     r#   r,   zTvpFramePadPrompter.__init__s  sx   '/KKKXYYY +"/%+%A",v/H1/LLlKF-q&2KVM`abb
 
 KF-q&2KVM`abb
 
 K%'&*Ca*GG- 

 

 K%'&*Ca*GG- 

 

r"   rm  rx   ry   r   c                     || j         z  || j         z  }}|j        \  }}}}	}
|                    ||z  ||	|
          }t          j                            |||fdd          }|                    |||||          }|S )z
        This method allows to interpolate the pre-trained pad weights, to be able to use the model on collection of high
        resolution images (high resolution videos).

        r   Fr   )rf  rn   r   r   rp   r   )r3   rm  rx   ry   r   r   batchrv   channelsprompt_heightprompt_widths              r#   interpolate_pad_encodingz,TvpFramePadPrompter.interpolate_pad_encoding  s     $++UT5F-FBCI<@z8]L 
 2Hm\ZZ**b	 + 
 
 z8VUKKr"   Frx  c                    |r|j         d         |j         d         fn| j        | j        f\  }}| j        dvrt          d| j                   | j        dv r(t	          j        ||g|j        |j                  }||z  }| j        dv rt	          j        d| j	        d	| j
        | j
        |j        
          }t	          j        | j        || j        gd          }t	          j        | j        || j        gd	          }t	          j        |                    d          |gz            }|r|                     |||          }||                    |j                  z   }|S )Nr   rY   )rB   ra  r`  z$Invalid visual_prompter_apply value )r`  ra  r   )r`  rB   r   r   rj  rm   r   r   )rn   rf  rc  r1   r   r   r   r   r   rv   rr  catrK  rL  rI  rJ  r   rx  rk  )r3   rt   rx  rx   ry   rl  baserm  s           r#   rS   zTvpFramePadPrompter.forward  sz    (8\#\%7%;<<#T%67 	
 %-III`DD^``aaa%)>>>!&VUO<CU^j^q!r!r!r..L%);;;;q$/1dndn]i]pqqqDYtT^D!LLLFYVT]CKKKFY|0033vh>??F' N66vvuMM'&))L4F*G*GGLr"   r   )r   r   r   r   r,   r   r   r   rx  r   rS   rT   rU   s   @r#   rp  rp  n  s         $
 $
 $
 $
 $
Lu| S QT Y^Ye    0 d        r"   rp  )framedownpadframepadzw
    The bare Tvp Model transformer outputting BaseModelOutputWithPooling object without any specific head on top.
    )custom_introc                        e Zd Z fdZd Zd Zd Ze	 	 	 	 	 	 	 	 ddee	j
                 dee	j                 d	ee	j
                 d
ee	j                 dee         dee         dee         defd            Z xZS )rV  c                 b   t                                          |           || _        t          |          | _        t          |          | _        t          |          | _        t          |          | _
        t          |          | _        t          j        t          j        dd|j        g                    | _        t          j        |j                  | _        |j        t.          vrt1          d          t/          |j                 |          | _        |                                  d S )Nr   
   z:`visual_prompter_type` must be in (framedownpad, framepad))r+   r,   rZ   rW   vision_modelr   r   r   visual_embeddingsr$  encoderr9  poolerr   rg  r   rh  r\   rW  r   r   r   visual_prompter_typeTVP_PROMPTER_CLASSES_MAPPINGr1   visual_prompter	post_initr   s     r#   r,   zTvpModel.__init__  s       *622088!8!@!@!&))''<QF<N4O(P(PQQz&"<==&.JJJYZZZ;F<WXY_``r"   c                     | j         j        S r  r   r   )r3   s    r#   get_input_embeddingszTvpModel.get_input_embeddings  s    ..r"   c                     || j         _        d S r  r  )r3   r   s     r#   set_input_embeddingszTvpModel.set_input_embeddings  s    */'''r"   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )zPrunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
        N)itemsr  r,  r  r   )r3   heads_to_pruner,  r   s       r#   _prune_headszTvpModel._prune_heads  sU     +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr"   NFr   rt   r   r   r   r.  r/  r   c	                 6   ||n| j         j        }|                     |                     ||                    }|                     |          }	|                     ||          }
||                    |
j        dd                   }t          j	        |j        d         d          
                    |j        |j                  }t          j        |||gd	
          }|                     ||                                          
                    |j                  }| j                            |	j        d         d	d	          }t          j        ||	|
gd
          }|                     |||                     || j         j                  |||          }|r|j        n|d         }|                     |          }|                     |          }|                     |          }|s||f|dd         z   S t1          |||j        |j                  S )a  
        Examples:
        ```python
        >>> import torch
        >>> from transformers import AutoConfig, AutoTokenizer, TvpModel

        >>> model = TvpModel.from_pretrained("Jiqing/tiny-random-tvp")

        >>> tokenizer = AutoTokenizer.from_pretrained("Jiqing/tiny-random-tvp")

        >>> pixel_values = torch.rand(1, 1, 3, 448, 448)
        >>> text_inputs = tokenizer("This is an example input", return_tensors="pt")
        >>> output = model(text_inputs.input_ids, pixel_values, text_inputs.attention_mask)
        ```N)rx  )r   r   rj   r   r  )r   r   rY   r   r   )r   r   r   r.  r/  )r1  pooler_outputr   r   )rZ   r/  r  r  r   r  new_onesrn   r   r   rk  r   r   rz  get_extended_attention_maskr   rW  r   r  get_head_maskr+  r1  r  r   r
   r   r   )r3   r   rt   r   r   r   r.  r/  r   text_embedding_outputvisual_embedding_outputvisual_attention_maskpt_maskrW  embedding_outputencoder_outputsr1  r?  s                     r#   rS   zTvpModel.forward  sS   4 &1%<kk$+BY((  H` aa
 
 !%) D D"&"8"83K #9 #
 #
 %$2$;$;<S<YZ\[\Z\<]$^$^!j!5a!8"==@@%,N4H A  G #YAV'W]_```N "==ninnN^N^__bbclcsttN&--.C.I!.LbRTUU 9k3HJa%bhijjj,,)((DK4QRR/!5# ' 
 
 BMdO==RabcRd$566 LL):;;]33 	L%}58KKK)/')7&1	
 
 
 	
r"   )NNNNNNNF)r   r   r   r,   r  r  r  r   r   r   
LongTensorr   r   rS   rT   rU   s   @r#   rV  rV    s2            / / /0 0 0C C C  15485915,0/3&*).F
 F
E,-F
 u01F
 !!12	F

 E-.F
 $D>F
 'tnF
 d^F
 #'F
 F
 F
 ^F
 F
 F
 F
 F
r"   rV  c                   $     e Zd Z fdZd Z xZS )TvpVideoGroundingHeadc                 :   t                                                       t          j        |j        |j        dz            | _        t          j        |j        dz  d          | _        t          j                    | _        t          j	                    | _
        d S )Nrj   )r+   r,   r   r   r\   layer_0layer_1ReLUactivation_0Sigmoidactivation_1r   s     r#   r,   zTvpVideoGroundingHead.__init__<  st    y!3V5G!5KLLy!3a!7;;GIIJLLr"   c                     |                      |                     |                    }|                     |                     |                    }|S r  )r  r  r  r  )r3   r  r   s      r#   rS   zTvpVideoGroundingHead.forwardC  sE    ""4<<#>#>??""4<<#7#788r"   r   rU   s   @r#   r  r  ;  sG        ) ) ) ) )      r"   r  zb
    Tvp Model with a video grounding head on top computing IoU, distance, and duration loss.
    c                       e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 dee	ej
                          deej                 d	ee         d
ee         dee         defd            Z xZS )TvpForVideoGroundingc                     t                                          |           || _        t          |          | _        t          |          | _        |                                  d S r  )r+   r,   rZ   rV  rB  r  video_grounding_headr  r   s     r#   r,   zTvpForVideoGrounding.__init__O  sW       f%%
$9&$A$A!r"   NFr   rt   r   rP   r   r   r.  r/  r   c
           
         ||n| j         j        }|                     ||||||||	          }
|
d         }|                     |          }d}|kt	          g d          }|                    | j                    |||          }|d         | j         j        |d         z  z   | j         j        |d         z  z   }|s|f|
dd         z   }
||f|
z   }
|
S t          |||
j
        |
j        	          S )
a  
        labels (`torch.FloatTensor` of shape `(batch_size, 3)`, *optional*):
            The labels contains duration, start time, and end time of the video corresponding to the text.

        Examples:
        ```python
        >>> import torch
        >>> from transformers import AutoConfig, AutoTokenizer, TvpForVideoGrounding

        >>> model = TvpForVideoGrounding.from_pretrained("Jiqing/tiny-random-tvp")

        >>> tokenizer = AutoTokenizer.from_pretrained("Jiqing/tiny-random-tvp")

        >>> pixel_values = torch.rand(1, 1, 3, 448, 448)
        >>> text_inputs = tokenizer("This is an example input", return_tensors="pt")
        >>> output = model(text_inputs.input_ids, pixel_values, text_inputs.attention_mask)
        ```N)r   r   r.  r/  r   r   r'   r(   r)   r*   rj   )r   r   r   r   )rZ   r/  rB  r  r%   rk  r   distance_loss_weightduration_loss_weightr   r   r   )r3   r   rt   r   rP   r   r   r.  r/  r   r  r  r   r   	criterion	loss_dicts                   r#   rS   zTvpForVideoGrounding.forwardW  sD   < &1%<kk$+BY**/!5#%=  	
 	
  
**=99 ? ? ?@@ILL%%%!	&&11I% +2Yz5JJK+2Yz5JJK 
  	i'!""+-G'G+N&!/)	
 
 
 	
r"   )	NNNNNNNNF)r   r   r   r,   r   r   r   r  r   r    r   r   rS   rT   rU   s   @r#   r  r  I  s             1548590415,0/3&*).@
 @
E,-@
 u01@
 !!12	@

 u|,-@
 E-.@
 $D>@
 'tn@
 d^@
 #'@
 @
 @
 ^@
 @
 @
 @
 @
r"   r  )rV  rA  r  )2r   r   dataclassesr   typingr   r   r   activationsr   modeling_layersr   modeling_outputsr	   r
   r   modeling_utilsr   pytorch_utilsr   utilsr   r   utils.backbone_utilsr   configuration_tvpr   
get_loggerr   loggerr   r[  r%   rW   r   r   r   r	  r  r  r$  r9  rA  r]  rp  r  rV  r  r  __all__r!   r"   r#   <module>r     s      ! ! ! ! ! !              ! ! ! ! ! ! 9 9 9 9 9 9 X X X X X X X X X X - - - - - - / / / / / / , , , , , , , , 1 1 1 1 1 1 ( ( ( ( ( ( 
	H	%	% ? ? ? ? ?k ? ?  ?$M M M M Mbi M M M`% % % % %RY % % %Pn n n n nbi n n nb! ! ! ! !RY ! ! !H_ _ _ _ _29 _ _ _F    bi       RY       /   82
 2
 2
 2
 2
 2
 2
 2
l    	    . . . . . . . .B" " " " "bi " " "JW W W W W") W W Wv ,#       
e
 e
 e
 e
 e
! e
 e
 
e
P    BI      
J
 J
 J
 J
 J
- J
 J
 
J
Z E
D
Dr"   