
     `iw                     (   d Z ddlZddlmZ ddlmZmZmZ ddl	Z	ddl	m
Z
 ddlmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZmZ ddlmZmZmZmZ ddlm Z m!Z! ddl"m#Z#  ej$        e%          Z&e ed           G d de                                  Z' G d de
j(                  Z) G d de
j(                  Z* G d de
j(                  Z+ G d de
j(                  Z,	 d@de
j(        de	j-        d e	j-        d!e	j-        d"ee	j-                 d#e.d$e.fd%Z/ G d& d'e
j(                  Z0 G d( d)e
j(                  Z1 G d* d+e
j(                  Z2 G d, d-e
j(                  Z3 G d. d/e
j(                  Z4 G d0 d1e          Z5 G d2 d3e
j(                  Z6e G d4 d5e                      Z7e G d6 d7e7                      Z8 G d8 d9e
j(                  Z9 G d: d;e
j(                  Z: ed<           G d= d>e7                      Z;g d?Z<dS )AzPyTorch YOLOS model.    N)	dataclass)CallableOptionalUnion)nn   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack) find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputTransformersKwargsauto_docstringlogging)can_return_tuplecheck_model_inputs   )YolosConfigz5
    Output type of [`YolosForObjectDetection`].
    )custom_introc                   D   e Zd ZU dZdZeej                 ed<   dZ	ee
         ed<   dZeej                 ed<   dZeej                 ed<   dZeee
                  ed<   dZeej                 ed<   dZeeej                          ed	<   dZeeej                          ed
<   dS )YolosObjectDetectionOutputa0  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
        Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
        bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
        scale-invariant IoU loss.
    loss_dict (`Dict`, *optional*):
        A dictionary containing the individual losses. Useful for logging.
    logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
        Classification logits (including no-object) for all queries.
    pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
        values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
        possible padding). You can use [`~YolosImageProcessor.post_process`] to retrieve the unnormalized bounding
        boxes.
    auxiliary_outputs (`list[Dict]`, *optional*):
        Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
        and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
        `pred_boxes`) for each decoder layer.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the decoder of the model.
    Nloss	loss_dictlogits
pred_boxesauxiliary_outputslast_hidden_statehidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   dictr   r    r!   listr"   r#   tupler$        |/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/yolos/modeling_yolos.pyr   r   &   s          , )-D(5$
%,,, $Ix~$$$*.FHU&'....2J*+222.2xT
+22259x 129998<M8E%"345<<<59Ju01299999r0   r   c                   P     e Zd ZdZdeddf fdZdej        dej        fdZ xZ	S )YolosEmbeddingszT
    Construct the CLS token, detection tokens, position and patch embeddings.

    configreturnNc                 F   t                                                       t          j        t	          j        dd|j                            | _        t          j        t	          j        d|j        |j                            | _	        t          |          | _        | j        j        }t          j        t	          j        d||j        z   dz   |j                            | _        t          j        |j                  | _        t#          |          | _        || _        d S Nr   )super__init__r   	Parameterr)   zeroshidden_size	cls_tokennum_detection_tokensdetection_tokensYolosPatchEmbeddingspatch_embeddingsnum_patchesposition_embeddingsDropouthidden_dropout_probdropout$InterpolateInitialPositionEmbeddingsinterpolationr4   )selfr4   rB   	__class__s      r1   r9   zYolosEmbeddings.__init__S   s    ek!Q8J&K&KLL "U[F<WY_Yk-l-l m m 4V < <+7#%<K;)DDqH&J\]]$
 $
  z&"<==A&IIr0   pixel_valuesc                    |j         \  }}}}|                     |          }|                                \  }}}| j                            |dd          }	| j                            |dd          }
t          j        |	||
fd          }|                     | j	        ||f          }||z   }| 
                    |          }|S )Nr   dim)shaperA   sizer=   expandr?   r)   catrH   rC   rF   )rI   rK   
batch_sizenum_channelsheightwidth
embeddingsseq_len_
cls_tokensr?   rC   s               r1   forwardzYolosEmbeddings.forwardb   s    2>2D/
L&%**<88
!+!2!2
GQ ^**:r2>>
077
BKKY
J8HIqQQQ
 #001IFTY?[["55
\\*--
r0   
r%   r&   r'   r(   r   r9   r)   Tensorr\   __classcell__rJ   s   @r1   r3   r3   M   s{         
{ t      EL U\        r0   r3   c                   8     e Zd Zd fdZddej        fdZ xZS )rG   r5   Nc                 V    t                                                       || _        d S Nr8   r9   r4   rI   r4   rJ   s     r1   r9   z-InterpolateInitialPositionEmbeddings.__init__x   $    r0   i   i@  c                    |d d dd d f         }|d d d f         }|d d | j         j         d d d f         }|d d d| j         j         d d f         }|                    dd          }|j        \  }}}| j         j        d         | j         j        z  | j         j        d         | j         j        z  }
}	|                    |||	|
          }|\  }}|| j         j        z  || j         j        z  }}t          j        	                    |||fdd          }|
                    d                              dd          }t          j        |||fd          }|S )Nr   r      bicubicFrQ   modealign_cornersrN   )r4   r>   	transposerP   
image_size
patch_sizeviewr   
functionalinterpolateflattenr)   rS   )rI   	pos_embedimg_sizecls_pos_embeddet_pos_embedpatch_pos_embedrT   r<   rY   patch_heightpatch_widthrV   rW   new_patch_heightnew_patch_widthscale_pos_embeds                   r1   r\   z,InterpolateInitialPositionEmbeddings.forward|   s   !!!!Q'*%aaag.!!!!dk&F%F%H%H!!!"KL#AAAqDK,L+L'Laaa$OP)33Aq99+:+@(
K K"1%)??K"1%)?? " *..z;Vabb ,2dk6L,LeW[WbWmNm/-33#3_"EIej 4 
 
 *11!44>>q!DD)]O]$SYZ[[[r0   r5   Nrg   r%   r&   r'   r9   r)   r^   r\   r_   r`   s   @r1   rG   rG   w   s_              %,        r0   rG   c                   8     e Zd Zd fdZddej        fdZ xZS ) InterpolateMidPositionEmbeddingsr5   Nc                 V    t                                                       || _        d S rc   rd   re   s     r1   r9   z)InterpolateMidPositionEmbeddings.__init__   rf   r0   rg   c                    |d d d d dd d f         }|d d d f         }|d d d d | j         j         d d d f         }|d d d d d| j         j         d d f         }|                    dd          }|j        \  }}}}	| j         j        d         | j         j        z  | j         j        d         | j         j        z  }}
|                    ||z  ||
|          }|\  }}|| j         j        z  || j         j        z  }}t          j        	                    |||fdd          }|
                    d                              dd                                                              ||||z  |          }t          j        |||fd          }|S )	Nr   r   ri   r   rj   Frk   rN   )r4   r>   rn   rP   ro   rp   rq   r   rr   rs   rt   
contiguousr)   rS   )rI   ru   rv   rw   rx   ry   depthrT   r<   rY   rz   r{   rV   rW   r|   r}   r~   s                    r1   r\   z(InterpolateMidPositionEmbeddings.forward   s   !!!!QQQ111*-%aaag.!!!!QQQ)I(I(K(KQQQ"NO#AAAqqq!t{/O.O*OQRQRQR$RS)33Aq992A2G/z; K"1%)??K"1%)?? " *..uz/A;P\^ijj ,2dk6L,LeW[WbWmNm/-33#3_"EIej 4 
 
 ##A&&Yq!__Z\\T%%5%GUU	 	  )]O]$SYZ[[[r0   r   r   r   r`   s   @r1   r   r      s_              %,        r0   r   c                   F     e Zd ZdZ fdZdej        dej        fdZ xZS )r@   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    t                                                       |j        |j        }}|j        |j        }}t          |t          j        j	                  r|n||f}t          |t          j        j	                  r|n||f}|d         |d         z  |d         |d         z  z  }|| _        || _        || _        || _
        t          j        ||||          | _        d S )Nr   r   )kernel_sizestride)r8   r9   ro   rp   rU   r<   
isinstancecollectionsabcIterablerB   r   Conv2d
projection)rI   r4   ro   rp   rU   r<   rB   rJ   s          r1   r9   zYolosPatchEmbeddings.__init__   s    !'!2F4EJ
$*$79Kk#-j+/:R#S#SqZZZdfpYq
#-j+/:R#S#SqZZZdfpYq
!!}
15*Q-:VW=:XY$$(&)L+:^hiiir0   rK   r5   c                     |j         \  }}}}|| j        k    rt          d          |                     |                              d                              dd          }|S )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.ri   r   )rP   rU   
ValueErrorr   rt   rn   )rI   rK   rT   rU   rV   rW   rX   s          r1   r\   zYolosPatchEmbeddings.forward   sm    2>2D/
L&%4,,,w   __\22::1==GG1MM
r0   )	r%   r&   r'   r(   r9   r)   r^   r\   r_   r`   s   @r1   r@   r@      sm         j j j j jEL U\        r0   r@           modulequerykeyvalueattention_maskscalingrF   c                    t          j        ||                    dd                    |z  }t          j                            |dt           j                                      |j                  }t          j        	                    ||| j
                  }|||z  }t          j        ||          }	|	                    dd                                          }	|	|fS )NrM   )rO   dtype)ptrainingr   ri   )r)   matmulrn   r   rr   softmaxfloat32tor   rF   r   r   )
r   r   r   r   r   r   rF   kwargsattn_weightsattn_outputs
             r1   eager_attention_forwardr      s     <s}}R'<'<==GL =((2U](SSVVW\WbccL =((6?([[L !#n4,|U33K''1--88::K$$r0   c            	            e Zd Zdef fdZ	 ddej        deej                 deej        ej        f         fdZ	 xZ
S )	YolosSelfAttentionr4   c                    t                                                       |j        |j        z  dk    r0t	          |d          s t          d|j         d|j         d          || _        |j        | _        t          |j        |j        z            | _        | j        | j        z  | _	        |j
        | _        | j        dz  | _        d| _        t          j        |j        | j	        |j                  | _        t          j        |j        | j	        |j                  | _        t          j        |j        | j	        |j                  | _        d S )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .g      F)bias)r8   r9   r<   num_attention_headshasattrr   r4   intattention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probr   	is_causalr   Linearqkv_biasr   r   r   re   s     r1   r9   zYolosSelfAttention.__init__   sB    ::a??PVXhHiHi?76#5 7 737 7 7  
 #)#= #&v'9F<V'V#W#W !58PP"?/5Yv143EFO\\\
9V/1C&/ZZZYv143EFO\\\


r0   Nr#   	head_maskr5   c           
         |j         d         }|d| j        | j        f} |                     |          j        |                     dd          } |                     |          j        |                     dd          } |                     |          j        |                     dd          }t          }| j	        j
        dk    rt          | j	        j
                 } || ||||| j        | j        | j        sdn| j                  \  }	}
|	                                d d         | j        fz   }|	                    |          }	|	|
fS )	Nr   rM   r   ri   eagerr   )r   r   rF   r   )rP   r   r   r   rq   rn   r   r   r   r4   _attn_implementationr   r   r   r   r   rQ   r   reshape)rI   r#   r   rT   	new_shape	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shapes               r1   r\   zYolosSelfAttention.forward  sY    #(+
D$<d>VV	0DHH]++0)<FFq!LL	4djj//4i@JJ1aPP4djj//4i@JJ1aPP(?;+w66"9$+:Z"[)<)<nL#}CCC$2C	*
 	*
 	*
& #0"4"4"6"6ss";t?Q>S"S%--.EFFo--r0   rc   )r%   r&   r'   r   r9   r)   r^   r   r.   r\   r_   r`   s   @r1   r   r      s        ]{ ] ] ] ] ] ]* PT. ."\.6>u|6L.	u|U\)	*. . . . . . . .r0   r   c                   Z     e Zd ZdZdef fdZdej        dej        dej        fdZ xZ	S )YolosSelfOutputz
    The residual connection is defined in YolosLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    r4   c                     t                                                       t          j        |j        |j                  | _        t          j        |j                  | _        d S rc   )	r8   r9   r   r   r<   denserD   rE   rF   re   s     r1   r9   zYolosSelfOutput.__init__1  sJ    Yv163EFF
z&"<==r0   r#   input_tensorr5   c                 Z    |                      |          }|                     |          }|S rc   r   rF   rI   r#   r   s      r1   r\   zYolosSelfOutput.forward6  s*    

=11]33r0   r]   r`   s   @r1   r   r   +  s         
>{ > > > > > >
U\  RWR^        r0   r   c                   |     e Zd Zdef fdZdee         fdZd
dej	        de
ej	                 dej	        fd	Z xZS )YolosAttentionr4   c                     t                                                       t          |          | _        t	          |          | _        t                      | _        d S rc   )r8   r9   r   	attentionr   outputsetpruned_headsre   s     r1   r9   zYolosAttention.__init__>  sI    +F33%f--EEr0   headsc                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r   rN   )lenr   r   r   r   r   r   r   r   r   r   r   r   union)rI   r   indexs      r1   prune_headszYolosAttention.prune_headsD  s   u::??F74>5t~7Y[_[l
 
u
  2$.2FNN/0BEJJ1$.2FNN.t{/@%QOOO .2^-ORUV[R\R\-\*'+~'IDNLn'n$ -33E::r0   Nr#   r   r5   c                 d    |                      ||          \  }}|                     ||          }|S rc   )r   r   )rI   r#   r   self_attn_outputrZ   r   s         r1   r\   zYolosAttention.forwardV  s4    "nn]IFF!-}==r0   rc   )r%   r&   r'   r   r9   r   r   r   r)   r^   r   r\   r_   r`   s   @r1   r   r   =  s        "{ " " " " " ";S ; ; ; ;$ U\ hu|>T `e`l        r0   r   c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )YolosIntermediater4   c                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S rc   )r8   r9   r   r   r<   intermediate_sizer   r   
hidden_actstrr	   intermediate_act_fnre   s     r1   r9   zYolosIntermediate.__init__^  sn    Yv163KLL
f'-- 	9'-f.?'@D$$$'-'8D$$$r0   r#   r5   c                 Z    |                      |          }|                     |          }|S rc   )r   r   )rI   r#   s     r1   r\   zYolosIntermediate.forwardf  s,    

=1100??r0   	r%   r&   r'   r   r9   r)   r^   r\   r_   r`   s   @r1   r   r   ]  sj        9{ 9 9 9 9 9 9U\ el        r0   r   c                   V     e Zd Zdef fdZdej        dej        dej        fdZ xZS )YolosOutputr4   c                     t                                                       t          j        |j        |j                  | _        t          j        |j                  | _	        d S rc   )
r8   r9   r   r   r   r<   r   rD   rE   rF   re   s     r1   r9   zYolosOutput.__init__n  sJ    Yv79KLL
z&"<==r0   r#   r   r5   c                 d    |                      |          }|                     |          }||z   }|S rc   r   r   s      r1   r\   zYolosOutput.forwards  s4    

=11]33%4r0   r   r`   s   @r1   r   r   m  su        >{ > > > > > >
U\  RWR^        r0   r   c                   h     e Zd ZdZdef fdZd	dej        deej                 dej        fdZ	 xZ
S )

YolosLayerz?This corresponds to the Block class in the timm implementation.r4   c                 z   t                                                       |j        | _        d| _        t	          |          | _        t          |          | _        t          |          | _	        t          j        |j        |j                  | _        t          j        |j        |j                  | _        d S )Nr   eps)r8   r9   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r   r   	LayerNormr<   layer_norm_epslayernorm_beforelayernorm_afterre   s     r1   r9   zYolosLayer.__init__~  s    '-'E$'//-f55!&)) "V-?VEZ [ [ [!|F,>FDYZZZr0   Nr#   r   r5   c                     |                      |          }|                     ||          }||z   }|                     |          }|                     |          }|                     ||          }|S rc   )r   r   r   r   r   )rI   r#   r   hidden_states_normattention_outputlayer_outputs         r1   r\   zYolosLayer.forward  sz    !22=AA>>*<iHH )=8 ++M::((66 {{<??r0   rc   )r%   r&   r'   r(   r   r9   r)   r^   r   r\   r_   r`   s   @r1   r   r   {  s        II[{ [ [ [ [ [ [ U\ hu|>T `e`l        r0   r   c                   h     e Zd Zdeddf fdZ	 d
dej        dededeej                 de	f
d	Z
 xZS )YolosEncoderr4   r5   Nc                    t                                                       | _        t          j        fdt          j                  D                       | _        d| _        dj	        d         j	        d         z  j
        dz  z  z   j        z   }j        r6t          j        t          j        j        dz
  d|j                            nd | _        j        rt%                    nd | _        d S )Nc                 .    g | ]}t                    S r/   )r   ).0rZ   r4   s     r1   
<listcomp>z)YolosEncoder.__init__.<locals>.<listcomp>  s!    #`#`#`1Jv$6$6#`#`#`r0   Fr   r   ri   )r8   r9   r4   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingro   rp   r>   use_mid_position_embeddingsr:   r)   r;   r<   mid_position_embeddingsr   rH   )rI   r4   
seq_lengthrJ   s    ` r1   r9   zYolosEncoder.__init__  s   ]#`#`#`#`fF^@_@_#`#`#`aa
&+# "1%(9!(<<@QST@TTUX^Xss 	 1	BL,q0&	     	$ JPIku=fEEEqur0   r#   rV   rW   r   c                 ,   | j         j        r|                     | j        ||f          }t	          | j                  D ]G\  }}|||         nd } |||          }| j         j        r|| j         j        dz
  k     r|||         z   }Ht          |          S )Nr   )r"   )r4   r  rH   r  	enumerater  r  r   )	rI   r#   rV   rW   r   $interpolated_mid_position_embeddingsilayer_modulelayer_head_masks	            r1   r\   zYolosEncoder.forward  s     ;2 	u373E3EdFbekmrds3t3t0(44 	\ 	\OA|.7.CillO(LHHM{6 \59::$14XYZ4[$[M????r0   rc   )r%   r&   r'   r   r9   r)   r^   r   r   r   r\   r_   r`   s   @r1   r   r     s        v{ vt v v v v v v: -1@ @|@ @ 	@
 EL)@ 
@ @ @ @ @ @ @ @r0   r   c                       e Zd ZU eed<   dZdZdZg ZdZ	dZ
dZdZeedZdeej        ej        ej        f         ddfd	ZdS )
YolosPreTrainedModelr4   vitrK   T)r#   r$   r   r5   Nc                    t          |t          j        t          j        f          rT|j        j                            d| j        j                   |j	         |j	        j        
                                 dS dS t          |t          j                  r?|j	        j        
                                 |j        j                            d           dS dS )zInitialize the weightsr   )meanstdNg      ?)r   r   r   r   weightdatanormal_r4   initializer_ranger   zero_r   fill_)rI   r   s     r1   _init_weightsz"YolosPreTrainedModel._init_weights  s    fry")455 	* M&&CT[5R&SSS{& &&((((( '&-- 	*K""$$$M$$S)))))	* 	*r0   )r%   r&   r'   r   r+   base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsr   r   r   r   r   r  r/   r0   r1   r  r    s         $O&*#N"&#( 

*E")RY*L$M 
*RV 
* 
* 
* 
* 
* 
*r0   r  c                        e Zd Zddedef fdZdefdZdee	e
e	         f         ddfd	Z ed
          e	 	 ddeej                 deej                 dee         defd                        Z xZS )
YolosModelTr4   add_pooling_layerc                 J   t                                          |           || _        t          |          | _        t          |          | _        t          j        |j	        |j
                  | _        |rt          |          nd| _        |                                  dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        r   N)r8   r9   r4   r3   rX   r   encoderr   r   r<   r   	layernormYolosPoolerpooler	post_init)rI   r4   r%  rJ   s      r1   r9   zYolosModel.__init__  s    
 	   )&11#F++f&8f>STTT->Hk&)))D 	r0   r5   c                     | j         j        S rc   )rX   rA   )rI   s    r1   get_input_embeddingszYolosModel.get_input_embeddings  s    //r0   heads_to_pruneNc                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )a	  
        Prunes heads of the model.

        Args:
            heads_to_prune (`dict`):
                See base class `PreTrainedModel`. The input dictionary must have the following format: {layer_num:
                list of heads to prune in this layer}
        N)itemsr'  r  r   r   )rI   r.  r  r   s       r1   _prune_headszYolosModel._prune_heads  sU     +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr0   F)tie_last_hidden_statesrK   r   r   c                 z   |t          d          |                     || j        j                  }|                     |          }|j        dd          \  }}|                     ||||          }|j        }|                     |          }| j	        | 	                    |          nd }	t          ||	          S )Nz You have to specify pixel_valuesr   )rV   rW   r   )r"   pooler_output)r   get_head_maskr4   r  rX   rP   r'  r"   r(  r*  r   )
rI   rK   r   r   embedding_outputrV   rW   encoder_outputssequence_outputpooled_outputs
             r1   r\   zYolosModel.forward  s     ?@@@ &&y$+2OPP	??<88$*233/+/<<V5I ,8 ,
 ,
 *;..998<8OO444UY)O[hiiiir0   )TNN)r%   r&   r'   r   boolr9   r@   r-  r,   r   r-   r1  r   r   r   r)   r^   r   r   r   r\   r_   r`   s   @r1   r$  r$    s.        { t      "0&: 0 0 0 0
C4T#Y+? 
CD 
C 
C 
C 
C u555 04,0j ju|,j EL)j +,	j
 
$j j j ^ 65j j j j jr0   r$  c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )r)  r4   c                     t                                                       t          j        |j        |j                  | _        t          j                    | _        d S rc   )r8   r9   r   r   r<   r   Tanh
activationre   s     r1   r9   zYolosPooler.__init__&  sC    Yv163EFF
'))r0   r#   r5   c                 r    |d d df         }|                      |          }|                     |          }|S )Nr   )r   r?  )rI   r#   first_token_tensorr9  s       r1   r\   zYolosPooler.forward+  s@     +111a40

#56666r0   r   r`   s   @r1   r)  r)  %  sj        ${ $ $ $ $ $ $
U\ el        r0   r)  c                   (     e Zd ZdZ fdZd Z xZS )YolosMLPPredictionHeada  
    Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
    height and width of a bounding box w.r.t. an image.

    Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py

    c                     t                                                       || _        |g|dz
  z  }t          j        d t          |g|z   ||gz             D                       | _        d S )Nr   c              3   F   K   | ]\  }}t          j        ||          V  d S rc   )r   r   )r   nks      r1   	<genexpr>z2YolosMLPPredictionHead.__init__.<locals>.<genexpr>B  s0      #g#g1BIaOO#g#g#g#g#g#gr0   )r8   r9   
num_layersr   r   ziplayers)rI   	input_dim
hidden_dim
output_dimrI  hrJ   s         r1   r9   zYolosMLPPredictionHead.__init__>  so    $LJN+m#g#gYKRSOUVZdYeUe@f@f#g#g#gggr0   c                     t          | j                  D ]F\  }}|| j        dz
  k     r(t          j                             ||                    n
 ||          }G|S r7   )r  rK  rI  r   rr   relu)rI   xr
  r  s       r1   r\   zYolosMLPPredictionHead.forwardD  sd    !$+.. 	V 	VHAu01DOa4G0G0G""5588,,,UUSTXXAAr0   )r%   r&   r'   r(   r9   r\   r_   r`   s   @r1   rC  rC  5  sV         h h h h h      r0   rC  zy
    YOLOS Model (consisting of a ViT encoder) with object detection heads on top, for tasks such as COCO detection.
    c                        e Zd Zdef fdZej        j        d             Ze	e
	 d
dej        deee                  dee         defd	                        Z xZS )YolosForObjectDetectionr4   c                 6   t                                          |           t          |d          | _        t	          |j        |j        |j        dz   d          | _        t	          |j        |j        dd          | _        | 	                                 d S )NF)r%  r   r   )rL  rM  rN  rI     )
r8   r9   r$  r  rC  r<   
num_labelsclass_labels_classifierbbox_predictorr+  re   s     r1   r9   z YolosForObjectDetection.__init__P  s        f>>> (>(V5GTZTehiTivw(
 (
 (
$ 5(V5GTUbc
 
 

 	r0   c                 V    d t          |d d         |d d                   D             S )Nc                     g | ]
\  }}||d S ))r   r    r/   )r   abs      r1   r   z9YolosForObjectDetection._set_aux_loss.<locals>.<listcomp>h  s$    ggg41a1A..gggr0   rM   )rJ  )rI   outputs_classoutputs_coords      r1   _set_aux_lossz%YolosForObjectDetection._set_aux_lossc  s9    
 hg3}SbS?QS`adbdadSe;f;fggggr0   NrK   labelsr   r5   c           
      2    | j         |fi |}|j        }|dd| j        j         dddf         }|                     |          }|                     |                                          }d\  }}	}
|}d\  }}| j        j        rC|j        }|                     |          }|                     |                                          }| 	                    ||| j
        || j        ||          \  }}	}
t          ||	|||
|j        |j        |j                  S )a	  
        labels (`list[Dict]` of len `(batch_size,)`, *optional*):
            Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
            following 2 keys: `'class_labels'` and `'boxes'` (the class labels and bounding boxes of an image in the
            batch respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding
            boxes in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image,
            4)`.

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoModelForObjectDetection
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("hustvl/yolos-tiny")
        >>> model = AutoModelForObjectDetection.from_pretrained("hustvl/yolos-tiny")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
        >>> target_sizes = torch.tensor([image.size[::-1]])
        >>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[
        ...     0
        ... ]

        >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
        ...     box = [round(i, 2) for i in box.tolist()]
        ...     print(
        ...         f"Detected {model.config.id2label[label.item()]} with confidence "
        ...         f"{round(score.item(), 3)} at location {box}"
        ...     )
        Detected remote with confidence 0.991 at location [46.48, 72.78, 178.98, 119.3]
        Detected remote with confidence 0.908 at location [336.48, 79.27, 368.23, 192.36]
        Detected cat with confidence 0.934 at location [337.18, 18.06, 638.14, 373.09]
        Detected cat with confidence 0.979 at location [10.93, 53.74, 313.41, 470.67]
        Detected remote with confidence 0.974 at location [41.63, 72.23, 178.09, 119.99]
        ```N)NNNr:  )r   r   r   r    r!   r"   r#   r$   )r  r"   r4   r>   rX  rY  sigmoidauxiliary_lossr#   loss_functiondevicer   r$   )rI   rK   ra  r   outputsr8  r   r    r   r   r!   r^  r_  r   s                 r1   r\   zYolosForObjectDetection.forwardj  sV   j /7dh|.N.Nv.N.N!3 *!!!dk.N-N-P-PRSRSRS*ST --o>>((99AACC
-=*i*+5(M={) L&4 $ < <\ J J $ 3 3L A A I I K K151C1CZmUb2 2.D). *!/%7!/)	
 	
 	
 		
r0   rc   )r%   r&   r'   r   r9   r)   jitunusedr`  r   r   r*   r   r-   r,   r   r   r   r\   r_   r`   s   @r1   rT  rT  J  s        {      & Yh h h  (,Q
 Q
'Q
 d$Q
 +,	Q

 
$Q
 Q
 Q
 ^ Q
 Q
 Q
 Q
 Q
r0   rT  )rT  r$  r  )r   )=r(   collections.abcr   dataclassesr   typingr   r   r   r)   r   activationsr	   modeling_layersr
   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   r   utilsr   r   r   r   utils.genericr   r   configuration_yolosr   
get_loggerr%   loggerr   Moduler3   rG   r   r@   r^   floatr   r   r   r   r   r   r   r   r  r$  r)  rC  rT  __all__r/   r0   r1   <module>r{     s7         ! ! ! ! ! ! , , , , , , , , , ,        ! ! ! ! ! ! 9 9 9 9 9 9 K K K K K K K K F F F F F F F F & & & & & & Q Q Q Q Q Q Q Q M M M M M M M M M M M M A A A A A A A A , , , , , , 
	H	%	%   
: : : : : : :  :B' ' ' ' 'bi ' ' 'T    29   :    ry   B    29   R % %I%<% 
% <	%
 U\*% % % % % %>1. 1. 1. 1. 1. 1. 1. 1.j    bi   $    RY   @    	    
 
 
 
 
") 
 
 
    +   <+@ +@ +@ +@ +@29 +@ +@ +@\ * * * * *? * * *8 =j =j =j =j =j% =j =j =j@    ")        RY   *   
n
 n
 n
 n
 n
2 n
 n
 
n
b L
K
Kr0   