
     `i%                     P   d Z ddlmZmZ ddlZddlmZ ddlZddlm	Z	m
Z
mZ ddlmZmZmZmZmZmZmZmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZm Z  	 ddl!m"Z"  G d de          Z# G d de          Z$ G d de
          Z% G d de	          Z& G d de          Z' G d de          Z( G d de          Z) G d de          Z* G d d e          Z+e G d! d"e                      Z, ed#$           G d% d&e                      Z- G d' d(e          Z.g d)Z/dS )*zPyTorch SAM 2 model.    )OptionalUnionN)
Sam2ConfigSam2MaskDecoderConfigSam2PromptEncoderConfig)Sam2AttentionSam2FeedForwardSam2LayerNorm	Sam2ModelSam2PreTrainedModelSam2TwoWayAttentionBlockSam2VisionEncoderOutputSam2VisionModel)TransformersKwargscheck_model_inputs   )PretrainedConfig)Unpack)auto_docstring   )CONFIG_MAPPING
AutoConfigT)TimmWrapperModelc                   L     e Zd ZdZdZdZdeiZ	 	 	 	 	 	 	 	 	 	 	 	 d fd	Z xZ	S )EdgeTamVisionConfiga  
    This is the configuration class to store the configuration of a [`EdgeTamVisionModel`]. It is used to instantiate a SAM
    vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    defaults will yield a similar configuration to that of SAM 2.1 Hiera-tiny
    [facebook/EdgeTAM](https://huggingface.co/facebook/EdgeTAM) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        backbone_config (`Union[dict, "PretrainedConfig"]`, *optional*):
            Configuration for the vision backbone. This is used to instantiate the backbone using
            `AutoModel.from_config`.
        backbone_channel_list (`List[int]`, *optional*, defaults to `[384, 192, 96, 48]`):
            The list of channel dimensions for the backbone.
        backbone_feature_sizes (`List[List[int]]`, *optional*, defaults to `[[256, 256], [128, 128], [64, 64]]`):
            The spatial sizes of the feature maps from the backbone.
        fpn_hidden_size (`int`, *optional*, defaults to 256):
            The hidden dimension of the FPN.
        fpn_kernel_size (`int`, *optional*, defaults to 1):
            The kernel size for the convolutions in the neck.
        fpn_stride (`int`, *optional*, defaults to 1):
            The stride for the convolutions in the neck.
        fpn_padding (`int`, *optional*, defaults to 0):
            The padding for the convolutions in the neck.
        fpn_top_down_levels (`List[int]`, *optional*, defaults to `[2, 3]`):
            The levels for the top-down FPN connections.
        num_feature_levels (`int`, *optional*, defaults to 3):
            The number of feature levels from the FPN to use.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the neck.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon for the layer normalization.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

    vision_configedgetam_vision_modelbackbone_configN      r   r   geluư>{Gz?c                     t                      j        di | |g dn|}|ddgddgddggn|}|ddgn|}t          |t                    r2|                    dd          |d<   t          |d                  di |}n6t          |t                    r|}n|t          j        d	dd
g dd          }|| _        || _	        || _
        || _        || _        || _        || _        || _        |	| _        |
| _        || _        || _        d S )N)i     `   0   r      @   r   r   
model_typetimm_wrapperztimm/repvit_m1.dist_in1kT)r   r    r   r   )in_chansfeatures_onlyout_indices)
model_args )super__init__
isinstancedictgetr   r   from_pretrainedr   backbone_channel_listbackbone_feature_sizesfpn_hidden_sizefpn_kernel_size
fpn_stridefpn_paddingfpn_top_down_levelsnum_feature_levels
hidden_actlayer_norm_epsinitializer_range)selfr   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   kwargs	__class__s                 /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/edgetam/modular_edgetam.pyr2   zEdgeTamVisionConfig.__init__^   sv     	""6"""6K6S 2 2 2 2Yn2H2Pc3Z#sb"X..Vl 	 )<(Cq!ffI\ot,, 		,;,?,?n,],]OL),_\-JK^^o^^OO44 	-OO$(8*()DQ]Q]Q]^^  O
  / &;"&<#..$&#6 "4$,!2    )NNNr   r    r    r   Nr   r!   r"   r#   )
__name__
__module____qualname____doc__base_config_keyr*   r   sub_configsr2   __classcell__)rD   s   @rE   r   r   1   s        $ $L &O'J:K "# 13 13 13 13 13 13 13 13 13 13rF   r   c                       e Zd ZdS )EdgeTamPromptEncoderConfigNrG   rH   rI   r0   rF   rE   rO   rO              DrF   rO   c                       e Zd ZdS )EdgeTamMaskDecoderConfigNrP   r0   rF   rE   rS   rS      rQ   rF   rS   c                       e Zd ZdS )EdgeTamConfigNrP   r0   rF   rE   rU   rU      rQ   rF   rU   c                       e Zd ZdS )EdgeTamLayerNormNrP   r0   rF   rE   rW   rW      rQ   rF   rW   c                       e Zd ZdS )EdgeTamVisionEncoderOutputNrP   r0   rF   rE   rY   rY      rQ   rF   rY   c                       e Zd ZdS )EdgeTamAttentionNrP   r0   rF   rE   r[   r[      rQ   rF   r[   c                       e Zd ZdS )EdgeTamTwoWayAttentionBlockNrP   r0   rF   rE   r]   r]      rQ   rF   r]   c                       e Zd ZdS )EdgeTamFeedForwardNrP   r0   rF   rE   r_   r_      rQ   rF   r_   c                       e Zd Zd ZdS )EdgeTamPreTrainedModelc                    | j         j        }t          |t          j        t          j        t          j        f          rG|j        j        	                    d|           |j
        |j
        j                                         nt          |t          j                  rR|j        j        	                    d|           |j        )|j        j        |j                                                  n^t          |t          j        t          f          r=|j        j                            d           |j
        j                                         t          |t"                    r'|j        "|j        j                                         d S d S d S )Ng        )meanstdg      ?)configrA   r3   nnLinearConv2dConvTranspose2dweightdatanormal_biaszero_	Embeddingpadding_idx	LayerNormrW   fill_EdgeTamModelno_memory_embedding)rB   modulerd   s      rE   _init_weightsz$EdgeTamPreTrainedModel._init_weights   sX   k+fry")R5GHII 
	%M&&CS&999{& &&(((-- 	%M&&CS&999!-"6#56<<>>>/? @AA 	%M$$S)))K""$$$fl++ 	8)5*/5577777	8 	855rF   N)rG   rH   rI   rv   r0   rF   rE   ra   ra      s#        8 8 8 8 8rF   ra   zN
    The vision model from EdgeTAM without any head or projection on top.
    )custom_introc            
           e Zd ZeZdZeedZd Ze		 dde
ej                 dee         deeef         fd            ZdS )	EdgeTamVisionModelpixel_values)hidden_states
attentionsc                      t          d          Nz2Can't get input embeddings from timm wrapper modelNotImplementedErrorrB   s    rE   get_input_embeddingsz'EdgeTamVisionModel.get_input_embeddings       !"VWWWrF   NrC   returnc                 8   |t          d          |                     |          }|j        }d |D             }|                     |          \  }}|| j         d          d d d         }|| j         d          d d d         }t          |d         ||          S )Nz You have to specify pixel_valuesc                 >    g | ]}|                     d ddd          S )r   r   r   r    )permute).0hidden_states     rE   
<listcomp>z.EdgeTamVisionModel.forward.<locals>.<listcomp>   s,    %v%v%v<l&:&:1aA&F&F%v%v%vrF   )last_hidden_statefpn_hidden_statesfpn_position_encoding)
ValueErrorbackboner   neckr>   rY   )rB   rz   rC   backbone_outputintermediate_hidden_statesr   r   s          rE   forwardzEdgeTamVisionModel.forward   s     ?@@@ --55%4%F"%v%v[u%v%v%v"3799=W3X3X00-t/F.F.H.HI$$B$O 5t7N6N6P6P QRVRVTVRV W)8</"7
 
 
 	
rF   )N)rG   rH   rI   r   config_classmain_input_namer   _can_record_outputsr   r   r   torchFloatTensorr   r   r   tuplerY   r   r0   rF   rE   ry   ry      s         'L$O,<L\]]X X X  59
 
u01
 +,
 
u00	1	
 
 
 
 
 
rF   ry   c                       e Zd Zg dZd ZdS )rs   )z
^memory_.*z^mask_downsample.*zspatial_perceiver.*z^object_pointer_proj.*z0^temporal_positional_encoding_projection_layer.*no_memory_positional_encodingno_object_pointer%occlusion_spatial_embedding_parameterc                      t          d          r~   r   r   s    rE   r   z!EdgeTamModel.get_input_embeddings   r   rF   N)rG   rH   rI   "_keys_to_ignore_on_load_unexpectedr   r0   rF   rE   rs   rs      s:        	* 	* 	*&X X X X XrF   rs   )rs   ry   ra   rU   r   rO   rS   )0rJ   typingr   r   r   torch.nnrf   torch.utils.checkpoint+transformers.models.sam2.configuration_sam2r   r   r   &transformers.models.sam2.modeling_sam2r   r	   r
   r   r   r   r   r   transformers.utils.genericr   r   configuration_utilsr   processing_utilsr   utilsr   autor   r   6transformers.models.timm_wrapper.modeling_timm_wrapperr   r   rO   rS   rU   rW   rY   r[   r]   r_   ra   ry   rs   __all__r0   rF   rE   <module>r      s     " " " " " " " "            r r r r r r r r r r	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 N M M M M M M M 3 3 3 3 3 3 & & & & & &      . - - - - - - -XWWWWWW^3 ^3 ^3 ^3 ^3* ^3 ^3 ^3B	 	 	 	 	!8 	 	 		 	 	 	 	4 	 	 		 	 	 	 	J 	 	 		 	 	 	 	} 	 	 		 	 	 	 	!8 	 	 		 	 	 	 	} 	 	 		 	 	 	 	": 	 	 		 	 	 	 	 	 	 	 8 8 8 8 80 8 8 8&   

 
 
 
 
 
 
 

DX X X X X9 X X X   rF   