
     `i<                         d dl mZ ddlmZmZ  G d de          Z G d de          Z G d d	e          Z G d
 de          Zg dZ	dS )   )PretrainedConfig   )CONFIG_MAPPING
AutoConfigc                   L     e Zd ZdZdZdZdeiZ	 	 	 	 	 	 	 	 	 	 	 	 d fd	Z xZ	S )EdgeTamVisionConfiga  
    This is the configuration class to store the configuration of a [`EdgeTamVisionModel`]. It is used to instantiate a SAM
    vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    defaults will yield a similar configuration to that of SAM 2.1 Hiera-tiny
    [facebook/EdgeTAM](https://huggingface.co/facebook/EdgeTAM) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        backbone_config (`Union[dict, "PretrainedConfig"]`, *optional*):
            Configuration for the vision backbone. This is used to instantiate the backbone using
            `AutoModel.from_config`.
        backbone_channel_list (`List[int]`, *optional*, defaults to `[384, 192, 96, 48]`):
            The list of channel dimensions for the backbone.
        backbone_feature_sizes (`List[List[int]]`, *optional*, defaults to `[[256, 256], [128, 128], [64, 64]]`):
            The spatial sizes of the feature maps from the backbone.
        fpn_hidden_size (`int`, *optional*, defaults to 256):
            The hidden dimension of the FPN.
        fpn_kernel_size (`int`, *optional*, defaults to 1):
            The kernel size for the convolutions in the neck.
        fpn_stride (`int`, *optional*, defaults to 1):
            The stride for the convolutions in the neck.
        fpn_padding (`int`, *optional*, defaults to 0):
            The padding for the convolutions in the neck.
        fpn_top_down_levels (`List[int]`, *optional*, defaults to `[2, 3]`):
            The levels for the top-down FPN connections.
        num_feature_levels (`int`, *optional*, defaults to 3):
            The number of feature levels from the FPN to use.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the neck.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon for the layer normalization.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

    vision_configedgetam_vision_modelbackbone_configN          r   geluư>{Gz?c                     t                      j        di | |g dn|}|ddgddgddggn|}|ddgn|}t          |t                    r2|                    dd          |d<   t          |d                  di |}n6t          |t                    r|}n|t          j        d	dd
g dd          }|| _        || _	        || _
        || _        || _        || _        || _        || _        |	| _        |
| _        || _        || _        d S )N)i     `   0   r      @   r   r   
model_typetimm_wrapperztimm/repvit_m1.dist_in1kT)r   r   r   r   )in_chansfeatures_onlyout_indices)
model_args )super__init__
isinstancedictgetr   r   from_pretrainedr   backbone_channel_listbackbone_feature_sizesfpn_hidden_sizefpn_kernel_size
fpn_stridefpn_paddingfpn_top_down_levelsnum_feature_levels
hidden_actlayer_norm_epsinitializer_range)selfr   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   kwargs	__class__s                 /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/edgetam/configuration_edgetam.pyr    zEdgeTamVisionConfig.__init__G   sv     	""6"""6K6S 2 2 2 2Yn2H2Pc3Z#sb"X..Vl 	 )<(Cq!ffI\ot,, 		,;,?,?n,],]OL),_\-JK^^o^^OO44 	-OO$(8*()DQ]Q]Q]^^  O
  / &;"&<#..$&#6 "4$,!2    )NNNr   r   r   r   Nr   r   r   r   )
__name__
__module____qualname____doc__base_config_keyr   r   sub_configsr    __classcell__r2   s   @r3   r   r      s        $ $L &O'J:K "# 13 13 13 13 13 13 13 13 13 13r4   r   c                   8     e Zd ZdZdZ	 	 	 	 	 	 	 	 d fd
	Z xZS )EdgeTamPromptEncoderConfigaB  
    This is the configuration class to store the configuration of a [`EdgeTamPromptEncoder`]. The [`EdgeTamPromptEncoder`]
    module is used to encode the input 2D points and bounding boxes.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 256):
            Dimensionality of the hidden states.
        image_size (`int`, *optional*, defaults to 1024):
            The expected output resolution of the image.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        mask_input_channels (`int`, *optional*, defaults to 16):
            The number of channels to be fed to the `MaskDecoder` module.
        num_point_embeddings (`int`, *optional*, defaults to 4):
            The number of point embeddings to be used.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the encoder and pooler.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        scale (`float`, *optional*, defaults to 1):
            The scale factor for the prompt encoder.
    prompt_encoder_configr            r   r   r   c	                      t                      j        di |	 || _        || _        || _        || _        || _        || _        || _        || _	        d S Nr   )
r   r    hidden_size
image_size
patch_sizemask_input_channelsnum_point_embeddingsr-   r.   scale)r0   rE   rF   rG   rH   rI   r-   r.   rJ   r1   r2   s             r3   r    z#EdgeTamPromptEncoderConfig.__init__   sb     	""6"""&$$#6 $8!$,


r4   )r   r@   rA   rA   rB   r   r   r   r5   r6   r7   r8   r9   r    r;   r<   s   @r3   r>   r>   {   sh         4 .O          r4   r>   c                   @     e Zd ZdZdZ	 	 	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )EdgeTamMaskDecoderConfiga  
    This is the configuration class to store the configuration of a [`EdgeTamMaskDecoder`]. It is used to instantiate a EDGETAM
    memory encoder according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 256):
            Dimensionality of the hidden states.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the EDGETAM mask decoder.
        mlp_dim (`int`, *optional*, defaults to 2048):
            The dimension of the MLP in the two-way transformer.
        num_hidden_layers (`int`, *optional*, defaults to 2):
            The number of hidden layers in the two-way transformer.
        num_attention_heads (`int`, *optional*, defaults to 8):
            The number of attention heads in the two-way transformer.
        attention_downsample_rate (`int`, *optional*, defaults to 2):
            The downsample rate for the attention layers.
        num_multimask_outputs (`int`, *optional*, defaults to 3):
            The number of multimask outputs.
        iou_head_depth (`int`, *optional*, defaults to 3):
            The depth of the IoU head.
        iou_head_hidden_dim (`int`, *optional*, defaults to 256):
            The hidden dimension of the IoU head.
        dynamic_multimask_via_stability (`bool`, *optional*, defaults to `True`):
            Whether to use dynamic multimask via stability.
        dynamic_multimask_stability_delta (`float`, *optional*, defaults to 0.05):
            The stability delta for the dynamic multimask.
        dynamic_multimask_stability_thresh (`float`, *optional*, defaults to 0.98):
            The stability threshold for the dynamic multimask.

    mask_decoder_configr   r      r      r   T皙?\(\?c                      t                      j        di | || _        || _        || _        || _        |	| _        |
| _        || _        || _	        || _
        || _        || _        || _        || _        d S rD   )r   r    rE   num_multimask_outputsr-   iou_head_depthiou_head_hidden_dimdynamic_multimask_via_stability!dynamic_multimask_stability_delta"dynamic_multimask_stability_threshnum_hidden_layersnum_attention_headsmlp_dimattention_downsample_rate)r0   rE   r-   r\   rZ   r[   r]   rT   rU   rV   rW   rX   rY   r1   r2   s                 r3   r    z!EdgeTamMaskDecoderConfig.__init__   s      	""6"""&%:"$,#6 /N,1R.2T/ "3&#6 )B&&&r4   )r   r   rO   r   rP   r   r   r   r   TrQ   rR   rK   r<   s   @r3   rM   rM      s        ! !F ,O "#(,*.+/ C  C  C  C  C  C  C  C  C  Cr4   rM   c                   <     e Zd ZdZdZeeedZ	 	 	 	 d fd	Z	 xZ
S )EdgeTamConfiga|	  
    [`EdgeTamConfig`] is the configuration class to store the configuration of a [`EdgeTamModel`]. It is used to instantiate a
    EDGETAM model according to the specified arguments, defining the memory attention, memory encoder, and image encoder
    configs. Instantiating a configuration defaults will yield a similar configuration to that of the SAM 2.1 Hiera-tiny
    [facebook/edgetam.1-hiera-tiny](https://huggingface.co/facebook/edgetam.1-hiera-tiny) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (Union[`dict`, `EdgeTamVisionConfig`], *optional*):
            Dictionary of configuration options used to initialize [`EdgeTamVisionConfig`].
        prompt_encoder_config (Union[`dict`, `EdgeTamPromptEncoderConfig`], *optional*):
            Dictionary of configuration options used to initialize [`EdgeTamPromptEncoderConfig`].
        mask_decoder_config (Union[`dict`, `EdgeTamMaskDecoderConfig`], *optional*):
            Dictionary of configuration options used to initialize [`EdgeTamMaskDecoderConfig`].
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation for parameter initialization.

    Example:

    ```python
    >>> from transformers import (
    ...     EdgeTamVisionConfig,
    ...     EdgeTamPromptEncoderConfig,
    ...     EdgeTamMaskDecoderConfig,
    ...     EdgeTamModel,
    ... )

    >>> # Initializing a EdgeTamConfig with `"facebook/edgetam.1_hiera_tiny"` style configuration
    >>> configuration = EdgeTamconfig()

    >>> # Initializing a EdgeTamModel (with random weights) from the `"facebook/edgetam.1_hiera_tiny"` style configuration
    >>> model = EdgeTamModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a EdgeTamConfig from a EdgeTamVisionConfig, EdgeTamPromptEncoderConfig, and EdgeTamMaskDecoderConfig

    >>> # Initializing EDGETAM vision encoder, memory attention, and memory encoder configurations
    >>> vision_config = EdgeTamVisionConfig()
    >>> prompt_encoder_config = EdgeTamPromptEncoderConfig()
    >>> mask_decoder_config = EdgeTamMaskDecoderConfig()

    >>> config = EdgeTamConfig(vision_config, prompt_encoder_config, mask_decoder_config)
    ```edgetam)r	   r?   rN   Nr   c                     t                      j        di | ||ni }||ni }||ni }t          |t                    r1|                    dd          |d<   t          |d                  di |}t          |t                    r|                                }t          |t                    r|                                }|| _	        t          di || _
        t          di || _        || _        d S )Nr   r
   r   )r   r    r!   r"   r#   r   r>   to_dictrM   r	   r?   rN   r/   )r0   r	   r?   rN   r/   r1   r2   s         r3   r    zEdgeTamConfig.__init__0  s1    	""6""")6)B9N9Z 5 5`b5H5T11Z\mT** 	Y*7*;*;LJ`*a*aM,'*=+FGXX-XXM+-GHH 	D$9$A$A$C$C!)+CDD 	@"5"="="?"?*%?%X%XBW%X%X"#;#R#R>Q#R#R !2r4   )NNNr   )r5   r6   r7   r8   r   r   r>   rM   r:   r    r;   r<   s   @r3   r_   r_      sr        . .` J#!;7 K " 3 3 3 3 3 3 3 3 3 3r4   r_   )r_   r   r>   rM   N)
configuration_utilsr   autor   r   r   r>   rM   r_   __all__r   r4   r3   <module>rf      s  , 4 3 3 3 3 3 - - - - - - - -^3 ^3 ^3 ^3 ^3* ^3 ^3 ^3B1 1 1 1 1!1 1 1 1hFC FC FC FC FC/ FC FC FCRQ3 Q3 Q3 Q3 Q3$ Q3 Q3 Q3h m
l
lr4   