§
     `ƒi‹<  ã                   óš   — d dl mZ ddlmZmZ  G d„ de¦  «        Z G d„ de¦  «        Z G d„ d	e¦  «        Z G d
„ de¦  «        Zg d¢Z	dS )é   )ÚPretrainedConfigé   )ÚCONFIG_MAPPINGÚ
AutoConfigc                   óL   ‡ — e Zd ZdZdZdZdeiZ	 	 	 	 	 	 	 	 	 	 	 	 dˆ fd„	Zˆ xZ	S )ÚEdgeTamVisionConfigaà  
    This is the configuration class to store the configuration of a [`EdgeTamVisionModel`]. It is used to instantiate a SAM
    vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    defaults will yield a similar configuration to that of SAM 2.1 Hiera-tiny
    [facebook/EdgeTAM](https://huggingface.co/facebook/EdgeTAM) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        backbone_config (`Union[dict, "PretrainedConfig"]`, *optional*):
            Configuration for the vision backbone. This is used to instantiate the backbone using
            `AutoModel.from_config`.
        backbone_channel_list (`List[int]`, *optional*, defaults to `[384, 192, 96, 48]`):
            The list of channel dimensions for the backbone.
        backbone_feature_sizes (`List[List[int]]`, *optional*, defaults to `[[256, 256], [128, 128], [64, 64]]`):
            The spatial sizes of the feature maps from the backbone.
        fpn_hidden_size (`int`, *optional*, defaults to 256):
            The hidden dimension of the FPN.
        fpn_kernel_size (`int`, *optional*, defaults to 1):
            The kernel size for the convolutions in the neck.
        fpn_stride (`int`, *optional*, defaults to 1):
            The stride for the convolutions in the neck.
        fpn_padding (`int`, *optional*, defaults to 0):
            The padding for the convolutions in the neck.
        fpn_top_down_levels (`List[int]`, *optional*, defaults to `[2, 3]`):
            The levels for the top-down FPN connections.
        num_feature_levels (`int`, *optional*, defaults to 3):
            The number of feature levels from the FPN to use.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the neck.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon for the layer normalization.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

    Úvision_configÚedgetam_vision_modelÚbackbone_configNé   é   é    r   Úgeluçíµ ÷Æ°>ç{®Gáz”?c                 ó  •—  t          ¦   «         j        di |¤Ž |€g d¢n|}|€ddgddgddggn|}|€ddgn|}t          |t          ¦  «        r2|                     dd¦  «        |d<   t          |d                  di |¤Ž}n6t          |t          ¦  «        r|}n|€t          j        d	dd
g d¢dœ¬¦  «        }|| _        || _	        || _
        || _        || _        || _        || _        || _        |	| _        |
| _        || _        || _        d S )N)i€  éÀ   é`   é0   r   é€   é@   r   r   Ú
model_typeÚtimm_wrapperztimm/repvit_m1.dist_in1kT)r   r   r   r   )Úin_chansÚfeatures_onlyÚout_indices)Ú
model_args© )ÚsuperÚ__init__Ú
isinstanceÚdictÚgetr   r   Úfrom_pretrainedr   Úbackbone_channel_listÚbackbone_feature_sizesÚfpn_hidden_sizeÚfpn_kernel_sizeÚ
fpn_strideÚfpn_paddingÚfpn_top_down_levelsÚnum_feature_levelsÚ
hidden_actÚlayer_norm_epsÚinitializer_range)Úselfr   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   ÚkwargsÚ	__class__s                 €ú…/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/edgetam/configuration_edgetam.pyr    zEdgeTamVisionConfig.__init__G   sv  ø€ ð  	‰ŒÔÐ"Ð"˜6Ð"Ð"Ð"à6KÐ6SÐ 2Ð 2Ð 2Ð 2ÐYnÐà2HÐ2Pˆc3ˆZ˜#˜s˜ b¨" XÐ.Ð.ÐVlð 	ð )<Ð(C˜q !˜f˜fÐI\Ðåo¥tÑ,Ô,ð 		Ø,;×,?Ò,?ÀÈnÑ,]Ô,]ˆO˜LÑ)Ý,¨_¸\Ô-JÔKÐ^Ð^ÈoÐ^Ð^ˆOˆOÝ˜­Ñ4Ô4ð 	Ø-ˆOˆOØÐ$Ý(Ô8Ø*Ø()¸DÐQ]ÐQ]ÐQ]Ð^Ð^ðñ ô ˆOð
  /ˆÔð &;ˆÔ"Ø&<ˆÔ#Ø.ˆÔØ.ˆÔØ$ˆŒØ&ˆÔØ#6ˆÔ Ø"4ˆÔà$ˆŒØ,ˆÔØ!2ˆÔÐÐó    )NNNr   r   r   r   Nr   r   r   r   )
Ú__name__Ú
__module__Ú__qualname__Ú__doc__Úbase_config_keyr   r   Úsub_configsr    Ú__classcell__©r2   s   @r3   r   r      s†   ø€ € € € € ð$ð $ðL &€OØ'€Jà˜:ð€Kð Ø"Ø#ØØØØØ ØØØØð13ð 13ð 13ð 13ð 13ð 13ð 13ð 13ð 13ð 13r4   r   c                   ó8   ‡ — e Zd ZdZdZ	 	 	 	 	 	 	 	 dˆ fd
„	Zˆ xZS )ÚEdgeTamPromptEncoderConfigaB  
    This is the configuration class to store the configuration of a [`EdgeTamPromptEncoder`]. The [`EdgeTamPromptEncoder`]
    module is used to encode the input 2D points and bounding boxes.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 256):
            Dimensionality of the hidden states.
        image_size (`int`, *optional*, defaults to 1024):
            The expected output resolution of the image.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        mask_input_channels (`int`, *optional*, defaults to 16):
            The number of channels to be fed to the `MaskDecoder` module.
        num_point_embeddings (`int`, *optional*, defaults to 4):
            The number of point embeddings to be used.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the encoder and pooler.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        scale (`float`, *optional*, defaults to 1):
            The scale factor for the prompt encoder.
    Úprompt_encoder_configr   é   é   é   r   r   r   c	                 óª   •—  t          ¦   «         j        di |	¤Ž || _        || _        || _        || _        || _        || _        || _        || _	        d S ©Nr   )
r   r    Úhidden_sizeÚ
image_sizeÚ
patch_sizeÚmask_input_channelsÚnum_point_embeddingsr-   r.   Úscale)r0   rE   rF   rG   rH   rI   r-   r.   rJ   r1   r2   s             €r3   r    z#EdgeTamPromptEncoderConfig.__init__˜   sb   ø€ ð 	‰ŒÔÐ"Ð"˜6Ð"Ð"Ð"Ø&ˆÔØ$ˆŒØ$ˆŒØ#6ˆÔ Ø$8ˆÔ!Ø$ˆŒØ,ˆÔØˆŒ
ˆ
ˆ
r4   )r   r@   rA   rA   rB   r   r   r   ©r5   r6   r7   r8   r9   r    r;   r<   s   @r3   r>   r>   {   sh   ø€ € € € € ðð ð4 .€Oð ØØØØØØØðð ð ð ð ð ð ð ð ð r4   r>   c                   ó@   ‡ — e Zd ZdZdZ	 	 	 	 	 	 	 	 	 	 	 	 dˆ fd„	Zˆ xZS )ÚEdgeTamMaskDecoderConfigaŠ  
    This is the configuration class to store the configuration of a [`EdgeTamMaskDecoder`]. It is used to instantiate a EDGETAM
    memory encoder according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 256):
            Dimensionality of the hidden states.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the EDGETAM mask decoder.
        mlp_dim (`int`, *optional*, defaults to 2048):
            The dimension of the MLP in the two-way transformer.
        num_hidden_layers (`int`, *optional*, defaults to 2):
            The number of hidden layers in the two-way transformer.
        num_attention_heads (`int`, *optional*, defaults to 8):
            The number of attention heads in the two-way transformer.
        attention_downsample_rate (`int`, *optional*, defaults to 2):
            The downsample rate for the attention layers.
        num_multimask_outputs (`int`, *optional*, defaults to 3):
            The number of multimask outputs.
        iou_head_depth (`int`, *optional*, defaults to 3):
            The depth of the IoU head.
        iou_head_hidden_dim (`int`, *optional*, defaults to 256):
            The hidden dimension of the IoU head.
        dynamic_multimask_via_stability (`bool`, *optional*, defaults to `True`):
            Whether to use dynamic multimask via stability.
        dynamic_multimask_stability_delta (`float`, *optional*, defaults to 0.05):
            The stability delta for the dynamic multimask.
        dynamic_multimask_stability_thresh (`float`, *optional*, defaults to 0.98):
            The stability threshold for the dynamic multimask.

    Úmask_decoder_configr   r   é   r   é   r   Tçš™™™™™©?ç\Âõ(\ï?c                 óð   •—  t          ¦   «         j        di |¤Ž || _        || _        || _        || _        |	| _        |
| _        || _        || _	        || _
        || _        || _        || _        || _        d S rD   )r   r    rE   Únum_multimask_outputsr-   Úiou_head_depthÚiou_head_hidden_dimÚdynamic_multimask_via_stabilityÚ!dynamic_multimask_stability_deltaÚ"dynamic_multimask_stability_threshÚnum_hidden_layersÚnum_attention_headsÚmlp_dimÚattention_downsample_rate)r0   rE   r-   r\   rZ   r[   r]   rT   rU   rV   rW   rX   rY   r1   r2   s                 €r3   r    z!EdgeTamMaskDecoderConfig.__init__Õ   s   ø€ ð  	‰ŒÔÐ"Ð"˜6Ð"Ð"Ð"à&ˆÔØ%:ˆÔ"Ø$ˆŒØ,ˆÔØ#6ˆÔ Ø/NˆÔ,Ø1RˆÔ.Ø2TˆÔ/ð "3ˆÔØ&ˆÔØ#6ˆÔ ØˆŒØ)BˆÔ&Ð&Ð&r4   )r   r   rO   r   rP   r   r   r   r   TrQ   rR   rK   r<   s   @r3   rM   rM   ¯   s   ø€ € € € € ð!ð !ðF ,€Oð ØØØØØ"#ØØØØ(,Ø*.Ø+/ð Cð  Cð  Cð  Cð  Cð  Cð  Cð  Cð  Cð  Cr4   rM   c                   ó<   ‡ — e Zd ZdZdZeeedœZ	 	 	 	 dˆ fd„	Z	ˆ xZ
S )ÚEdgeTamConfiga|	  
    [`EdgeTamConfig`] is the configuration class to store the configuration of a [`EdgeTamModel`]. It is used to instantiate a
    EDGETAM model according to the specified arguments, defining the memory attention, memory encoder, and image encoder
    configs. Instantiating a configuration defaults will yield a similar configuration to that of the SAM 2.1 Hiera-tiny
    [facebook/edgetam.1-hiera-tiny](https://huggingface.co/facebook/edgetam.1-hiera-tiny) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (Union[`dict`, `EdgeTamVisionConfig`], *optional*):
            Dictionary of configuration options used to initialize [`EdgeTamVisionConfig`].
        prompt_encoder_config (Union[`dict`, `EdgeTamPromptEncoderConfig`], *optional*):
            Dictionary of configuration options used to initialize [`EdgeTamPromptEncoderConfig`].
        mask_decoder_config (Union[`dict`, `EdgeTamMaskDecoderConfig`], *optional*):
            Dictionary of configuration options used to initialize [`EdgeTamMaskDecoderConfig`].
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation for parameter initialization.

    Example:

    ```python
    >>> from transformers import (
    ...     EdgeTamVisionConfig,
    ...     EdgeTamPromptEncoderConfig,
    ...     EdgeTamMaskDecoderConfig,
    ...     EdgeTamModel,
    ... )

    >>> # Initializing a EdgeTamConfig with `"facebook/edgetam.1_hiera_tiny"` style configuration
    >>> configuration = EdgeTamconfig()

    >>> # Initializing a EdgeTamModel (with random weights) from the `"facebook/edgetam.1_hiera_tiny"` style configuration
    >>> model = EdgeTamModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a EdgeTamConfig from a EdgeTamVisionConfig, EdgeTamPromptEncoderConfig, and EdgeTamMaskDecoderConfig

    >>> # Initializing EDGETAM vision encoder, memory attention, and memory encoder configurations
    >>> vision_config = EdgeTamVisionConfig()
    >>> prompt_encoder_config = EdgeTamPromptEncoderConfig()
    >>> mask_decoder_config = EdgeTamMaskDecoderConfig()

    >>> config = EdgeTamConfig(vision_config, prompt_encoder_config, mask_decoder_config)
    ```Úedgetam)r	   r?   rN   Nr   c                 óî  •—  t          ¦   «         j        di |¤Ž ||ni }||ni }||ni }t          |t          ¦  «        r1|                     dd¦  «        |d<   t          |d                  di |¤Ž}t          |t          ¦  «        r|                     ¦   «         }t          |t          ¦  «        r|                     ¦   «         }|| _	        t          di |¤Ž| _
        t          di |¤Ž| _        || _        d S )Nr   r
   r   )r   r    r!   r"   r#   r   r>   Úto_dictrM   r	   r?   rN   r/   )r0   r	   r?   rN   r/   r1   r2   s         €r3   r    zEdgeTamConfig.__init__0  s1  ø€ ð 	‰ŒÔÐ"Ð"˜6Ð"Ð"Ð"Ø)6Ð)B˜˜ÈˆØ9NÐ9ZÐ 5Ð 5Ð`bÐØ5HÐ5TÐ1Ð1ÐZ\Ðåm¥TÑ*Ô*ð 	YØ*7×*;Ò*;¸LÐJ`Ñ*aÔ*aˆM˜,Ñ'Ý*¨=¸Ô+FÔGÐXÐXÈ-ÐXÐXˆMÝÐ+Õ-GÑHÔHð 	DØ$9×$AÒ$AÑ$CÔ$CÐ!ÝÐ)Õ+CÑDÔDð 	@Ø"5×"=Ò"=Ñ"?Ô"?Ðà*ˆÔÝ%?Ð%XÐ%XÐBWÐ%XÐ%XˆÔ"Ý#;Ð#RÐ#RÐ>QÐ#RÐ#RˆÔ à!2ˆÔÐÐr4   )NNNr   )r5   r6   r7   r8   r   r   r>   rM   r:   r    r;   r<   s   @r3   r_   r_   ø   sr   ø€ € € € € ð.ð .ð` €Jà#Ø!;Ø7ðð €Kð Ø"Ø Øð3ð 3ð 3ð 3ð 3ð 3ð 3ð 3ð 3ð 3r4   r_   )r_   r   r>   rM   N)
Úconfiguration_utilsr   Úautor   r   r   r>   rM   r_   Ú__all__r   r4   r3   ú<module>rf      s  ðð, 4Ð 3Ð 3Ð 3Ð 3Ð 3Ø -Ð -Ð -Ð -Ð -Ð -Ð -Ð -ð^3ð ^3ð ^3ð ^3ð ^3Ð*ñ ^3ô ^3ð ^3ðB1ð 1ð 1ð 1ð 1Ð!1ñ 1ô 1ð 1ðhFCð FCð FCð FCð FCÐ/ñ FCô FCð FCðRQ3ð Q3ð Q3ð Q3ð Q3Ð$ñ Q3ô Q3ð Q3ðh mÐ
lÐ
l€€€r4   