
     `i-                        d Z ddlmZ ddlmZmZ ddlZddlmZ ddlm	Z	 ddl
mZ dd	lmZ dd
lmZmZmZmZ ddlmZ ddlmZ ddlmZ  ej        e          Ze ed           G d de                                  Ze G d de                      ZddZ G d dej                  Z  G d dej                  Z! ed           G d de                      Z"ddgZ#dS )zPyTorch VitPose model.    )	dataclass)OptionalUnionN)nn   )BackboneOutput)PreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringlogging)load_backbone)can_return_tuple   )VitPoseConfigz6
    Class for outputs of pose estimation models.
    )custom_introc                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eeej        df                  ed<   dZeeej        df                  ed<   dS )VitPoseEstimatorOutputaH  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Loss is not supported at this moment. See https://github.com/ViTAE-Transformer/ViTPose/tree/main/mmpose/models/losses for further detail.
    heatmaps (`torch.FloatTensor` of shape `(batch_size, num_keypoints, height, width)`):
        Heatmaps as predicted by the model.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
        (also called feature maps) of the model at the output of each stage.
    Nlossheatmaps.hidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   tupler        /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/vitpose/modeling_vitpose.pyr   r   %   s         	 	 )-D(5$
%,,,,0Hhu()000=AM8E%"3S"89:AAA:>Ju0#567>>>>>r#   r   c                   b    e Zd ZU eed<   dZdZdZdee	j
        e	j        e	j        f         fdZdS )VitPosePreTrainedModelconfigvitpixel_valuesTmodulec                 J   t          |t          j        t          j        f          rt          j                            |j        j                            t          j
                  d| j        j                                      |j        j                  |j        _        |j         |j        j                                         dS dS t          |t          j                  r?|j        j                                         |j        j                            d           dS dS )zInitialize the weightsg        )meanstdNg      ?)
isinstancer   LinearConv2dinittrunc_normal_weightdatator   float32r'   initializer_rangedtypebiaszero_	LayerNormfill_)selfr*   s     r$   _init_weightsz$VitPosePreTrainedModel._init_weightsD   s    fry")455 
	* "$!6!6"%%em443DKDa "7 " "b$%% M {& &&((((( '&-- 	*K""$$$M$$S)))))	* 	*r#   N)r   r   r   r   r    base_model_prefixmain_input_namesupports_gradient_checkpointingr   r   r/   r0   r;   r>   r"   r#   r$   r&   r&   =   s`         $O&*#*E")RY*L$M * * * * * *r#   r&   gaussian-heatmapc                    |dvrt          d          | j        dk    rt          d          | j        \  }}}}d}|dk    rd}| dddddd	f          | dddddd	f<   |                     |d
|||          } |                                 }|                                D ]/\  }	}
| dd|
d	f         |dd|	d	f<   | dd|	d	f         |dd|
d	f<   0|                    ||||f          }|                    d
          }|S )a  Flip the flipped heatmaps back to the original form.

    Args:
        output_flipped (`torch.tensor` of shape `(batch_size, num_keypoints, height, width)`):
            The output heatmaps obtained from the flipped images.
        flip_pairs (`torch.Tensor` of shape `(num_keypoints, 2)`):
            Pairs of keypoints which are mirrored (for example, left ear -- right ear).
        target_type (`str`, *optional*, defaults to `"gaussian-heatmap"`):
            Target type to use. Can be gaussian-heatmap or combined-target.
            gaussian-heatmap: Classification target with gaussian distribution.
            combined-target: The combination of classification target (response map) and regression target (offset map).
            Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020).

    Returns:
        torch.Tensor: heatmaps that flipped back to the original image
    )rB   combined-targetz9target_type should be gaussian-heatmap or combined-target   zCoutput_flipped should be [batch_size, num_keypoints, height, width]r   rD   r   N.)
ValueErrorndimshapereshapeclonetolistflip)output_flipped
flip_pairstarget_type
batch_sizenum_keypointsheightwidthchannelsoutput_flipped_backleftrights              r$   	flip_backrY   S   si   " AAATUUUa^___/=/C,JvuH'''(6qqq!$Q$|(D'Dqqq!$Q$|$#++JHfeTTN(..00 "((** J Je,:111eS=,IAAAtSL)-;AAAtSL-IAAAucM**-55z=RXZ_6`aa-22266r#   c                   h     e Zd ZdZdef fdZd	dej        deej                 dej        fdZ	 xZ
S )
VitPoseSimpleDecoderz
    Simple decoding head consisting of a ReLU activation, 4x upsampling and a 3x3 convolution, turning the
    feature maps into heatmaps.
    r'   c                    t                                                       t          j                    | _        t          j        |j        dd          | _        t          j        |j	        j
        |j        ddd          | _        d S )NbilinearF)scale_factormodealign_cornersr   r   kernel_sizestridepadding)super__init__r   ReLU
activationUpsampler^   
upsamplingr0   backbone_confighidden_size
num_labelsconvr=   r'   	__class__s     r$   rf   zVitPoseSimpleDecoder.__init__   st    '))+63FZglmmmI".0AqYZde
 
 
			r#   Nhidden_staterO   returnc                     |                      |          }|                     |          }|                     |          }|t          ||          }|S N)rh   rj   rn   rY   r=   rq   rO   r   s       r$   forwardzVitPoseSimpleDecoder.forward   sO    |44|4499\**! :66Hr#   rt   r   r   r   r   r   rf   r   Tensorr   rv   __classcell__rp   s   @r$   r[   r[   {   s         

} 
 
 
 
 
 
	 	EL 	hu|>T 	`e`l 	 	 	 	 	 	 	 	r#   r[   c                   Z     e Zd ZdZdef fdZddej        deej                 fdZ	 xZ
S )	VitPoseClassicDecoderz
    Classic decoding head consisting of a 2 deconvolutional blocks, followed by a 1x1 convolution layer,
    turning the feature maps into heatmaps.
    r'   c                    t                                                       t          j        |j        j        ddddd          | _        t          j        d          | _        t          j	                    | _
        t          j        dddddd          | _        t          j        d          | _        t          j	                    | _        t          j        d|j        ddd          | _        d S )	N   rE      r   F)rb   rc   rd   r9   r   ra   )re   rf   r   ConvTranspose2drk   rl   deconv1BatchNorm2d
batchnorm1rg   relu1deconv2
batchnorm2relu2r0   rm   rn   ro   s     r$   rf   zVitPoseClassicDecoder.__init__   s    )".1VW^c
 
 
 .--WYY
)#s!UV]bccc.--WYY
Ic6#4!AWXYYY			r#   Nrq   rO   c                 P   |                      |          }|                     |          }|                     |          }|                     |          }|                     |          }|                     |          }|                     |          }|t          ||          }|S rt   )r   r   r   r   r   r   rn   rY   ru   s       r$   rv   zVitPoseClassicDecoder.forward   s    ||L11|44zz,//||L11|44zz,//99\**! :66Hr#   rt   rw   rz   s   @r$   r|   r|      s         
Z} Z Z Z Z Z Z EL hu|>T        r#   r|   z?
    The VitPose model with a pose estimation head on top.
    c                        e Zd Zdef fdZee	 	 	 ddej        de	ej                 de	ej                 de	ej                 de
e         d	efd
                        Z xZS )VitPoseForPoseEstimationr'   c                    t                                          |           t          |          | _        t	          | j        j        d          st          d          t	          | j        j        d          st          d          t	          | j        j        d          st          d          |j        rt          |          nt          |          | _
        |                                  d S )Nrl   z0The backbone should have a hidden_size attribute
image_sizez0The backbone should have an image_size attribute
patch_sizez/The backbone should have a patch_size attribute)re   rf   r   backbonehasattrr'   rG   use_simple_decoderr[   r|   head	post_initro   s     r$   rf   z!VitPoseForPoseEstimation.__init__   s       %f-- t}+];; 	QOPPPt}+\:: 	QOPPPt}+\:: 	PNOOO4:4Mp(000ShioSpSp	 	r#   Nr)   dataset_indexrO   labelskwargsrr   c                 ,   d}|t          d           | j        j        |fd|i|}|j        d         }|j        d         }	| j        j        j        d         | j        j        j        d         z  }
| j        j        j        d         | j        j        j        d         z  }|	                    ddd          }|
                    |	d|
|                                          }|                     ||          }t          |||j        |j        	          S )
ac  
        dataset_index (`torch.Tensor` of shape `(batch_size,)`):
            Index to use in the Mixture-of-Experts (MoE) blocks of the backbone.

            This corresponds to the dataset index used during training, e.g. For the single dataset index 0 refers to the corresponding dataset. For the multiple datasets index 0 refers to dataset A (e.g. MPII) and index 1 refers to dataset B (e.g. CrowdPose).
        flip_pairs (`torch.tensor`, *optional*):
            Whether to mirror pairs of keypoints (for example, left ear -- right ear).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, VitPoseForPoseEstimation
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> processor = AutoImageProcessor.from_pretrained("usyd-community/vitpose-base-simple")
        >>> model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> boxes = [[[412.8, 157.61, 53.05, 138.01], [384.43, 172.21, 15.12, 35.74]]]
        >>> inputs = processor(image, boxes=boxes, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        >>> heatmaps = outputs.heatmaps
        ```NzTraining is not yet supportedr   rF   r   r   r   )rO   )r   r   r   r   )NotImplementedErrorr   forward_with_filtered_kwargsfeature_mapsrI   r'   rk   r   r   permuterJ   
contiguousr   r   r   r   )r=   r)   r   rO   r   r   r   outputssequence_outputrQ   patch_heightpatch_widthr   s                r$   rv   z VitPoseForPoseEstimation.forward   s0   N %&EFFF"L$-"L#
 #
'#
 #
 #
 ".r2$*1-
{2=a@DKD_DjklDmmk1<Q?4;C^CijkCll)11!Q::)11*b,P[\\ggii99_9DD%!/)	
 
 
 	
r#   )NNN)r   r   r   r   rf   r   r   r   rx   r   r
   r   r   rv   ry   rz   s   @r$   r   r      s        }      $  15-1)->
 >
l>
  ->
 U\*	>

 &>
 +,>
 
 >
 >
 >
 ^ >
 >
 >
 >
 >
r#   r   )rB   )$r   dataclassesr   typingr   r   r   r   modeling_outputsr   modeling_utilsr	   processing_utilsr
   utilsr   r   r   r   utils.backbone_utilsr   utils.genericr   configuration_vitposer   
get_loggerr   loggerr   r&   rY   Moduler[   r|   r   __all__r"   r#   r$   <module>r      s]     ! ! ! ! ! ! " " " " " " " "        . . . . . . - - - - - - & & & & & & M M M M M M M M M M M M 1 1 1 1 1 1 - - - - - - 0 0 0 0 0 0 
	H	%	%
   
? ? ? ? ?[ ? ?  ?$ * * * * *_ * * **% % % %P    29   6# # # # #BI # # #L   
S
 S
 S
 S
 S
5 S
 S
 
S
l $%?
@r#   