
"""SAM model configuration"""

from ...configuration_utils import PretrainedConfig
from ...utils import logging


logger = logging.get_logger(__name__)


class SamPromptEncoderConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`SamPromptEncoder`]. The [`SamPromptEncoder`]
    module is used to encode the input 2D points and bounding boxes. Instantiating a configuration with the defaults
    will yield a configuration similar to that of the SAM-vit-h
    [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 256):
            Dimensionality of the hidden states.
        image_size (`int`, *optional*, defaults to 1024):
            The expected output resolution of the image.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        mask_input_channels (`int`, *optional*, defaults to 16):
            The number of channels to be fed to the `MaskDecoder` module.
        num_point_embeddings (`int`, *optional*, defaults to 4):
            The number of point embeddings to be used.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the encoder and pooler.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
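
    Example (a minimal usage sketch):

    ```python
    >>> from transformers import SamPromptEncoderConfig

    >>> # Instantiating with the defaults mirrors the `"facebook/sam-vit-huge"` prompt encoder
    >>> prompt_encoder_config = SamPromptEncoderConfig()

    >>> # The prompt grid resolution follows from `image_size // patch_size` (1024 // 16)
    >>> prompt_encoder_config.image_embedding_size
    64
    ```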
    """

    base_config_key = "prompt_encoder_config"

    def __init__(self, hidden_size=256, image_size=1024, patch_size=16, mask_input_channels=16,
                 num_point_embeddings=4, hidden_act="gelu", layer_norm_eps=1e-6, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.image_size = image_size
        self.patch_size = patch_size
        self.image_embedding_size = image_size // patch_size
        self.mask_input_channels = mask_input_channels
        self.num_point_embeddings = num_point_embeddings
        self.hidden_act = hidden_act
        self.layer_norm_eps = layer_norm_eps


class SamMaskDecoderConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`SamMaskDecoder`]. It is used to instantiate a SAM
    mask decoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a configuration similar to that of the SAM-vit-h
    [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 256):
            Dimensionality of the hidden states.
        hidden_act (`str`, *optional*, defaults to `"relu"`):
            The non-linear activation function used inside the `SamMaskDecoder` module.
        mlp_dim (`int`, *optional*, defaults to 2048):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 2):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        attention_downsample_rate (`int`, *optional*, defaults to 2):
            The downsampling rate of the attention layer.
        num_multimask_outputs (`int`, *optional*, defaults to 3):
            The number of outputs from the `SamMaskDecoder` module. In the Segment Anything paper, this is set to 3.
        iou_head_depth (`int`, *optional*, defaults to 3):
            The number of layers in the IoU head module.
        iou_head_hidden_dim (`int`, *optional*, defaults to 256):
            The dimensionality of the hidden states in the IoU head module.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
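
    Example (a minimal usage sketch):

    ```python
    >>> from transformers import SamMaskDecoderConfig

    >>> # Instantiating with the defaults mirrors the `"facebook/sam-vit-huge"` mask decoder
    >>> mask_decoder_config = SamMaskDecoderConfig()

    >>> # As in the Segment Anything paper, three candidate masks are predicted per prompt
    >>> mask_decoder_config.num_multimask_outputs
    3
    ```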

    """

    base_config_key = "mask_decoder_config"

    def __init__(self, hidden_size=256, hidden_act="relu", mlp_dim=2048, num_hidden_layers=2,
                 num_attention_heads=8, attention_downsample_rate=2, num_multimask_outputs=3,
                 iou_head_depth=3, iou_head_hidden_dim=256, layer_norm_eps=1e-6, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.mlp_dim = mlp_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.attention_downsample_rate = attention_downsample_rate
        self.num_multimask_outputs = num_multimask_outputs
        self.iou_head_depth = iou_head_depth
        self.iou_head_hidden_dim = iou_head_hidden_dim
        self.layer_norm_eps = layer_norm_eps


class SamVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`SamVisionModel`]. It is used to instantiate a SAM
    vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a configuration similar to that of the SAM ViT-h
    [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        output_channels (`int`, *optional*, defaults to 256):
            Dimensionality of the output channels in the Patch Encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of channels in the input image.
        image_size (`int`, *optional*, defaults to 1024):
            Expected resolution. Target size of the resized input image.
        patch_size (`int`, *optional*, defaults to 16):
            Size of the patches to be extracted from the input image.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string).
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 1e-10):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        qkv_bias (`bool`, *optional*, defaults to `True`):
            Whether to add a bias to query, key, value projections.
        mlp_ratio (`float`, *optional*, defaults to 4.0):
            Ratio of mlp hidden dim to embedding dim.
        use_abs_pos (`bool`, *optional*, defaults to `True`):
            Whether to use absolute position embedding.
        use_rel_pos (`bool`, *optional*, defaults to `True`):
            Whether to use relative position embedding.
        window_size (`int`, *optional*, defaults to 14):
            Window size for relative position.
        global_attn_indexes (`list[int]`, *optional*, defaults to `[2, 5, 8, 11]`):
            The indexes of the global attention layers.
        num_pos_feats (`int`, *optional*, defaults to 128):
            The dimensionality of the position embedding.
        mlp_dim (`int`, *optional*):
            The dimensionality of the MLP layer in the Transformer encoder. If `None`, defaults to `mlp_ratio *
            hidden_size`.

    Example:

    ```python
    >>> from transformers import (
    ...     SamVisionConfig,
    ...     SamVisionModel,
    ... )

    >>> # Initializing a SamVisionConfig with `"facebook/sam-vit-huge"` style configuration
    >>> configuration = SamVisionConfig()

    >>> # Initializing a SamVisionModel (with random weights) from the `"facebook/sam-vit-huge"` style configuration
    >>> model = SamVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    base_config_key = "vision_config"
    model_type = "sam_vision_model"

    def __init__(self, hidden_size=768, output_channels=256, num_hidden_layers=12, num_attention_heads=12,
                 num_channels=3, image_size=1024, patch_size=16, hidden_act="gelu", layer_norm_eps=1e-06,
                 attention_dropout=0.0, initializer_range=1e-10, qkv_bias=True, mlp_ratio=4.0,
                 use_abs_pos=True, use_rel_pos=True, window_size=14, global_attn_indexes=[2, 5, 8, 11],
                 num_pos_feats=128, mlp_dim=None, **kwargs):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.output_channels = output_channels
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.image_size = image_size
        self.patch_size = patch_size
        self.hidden_act = hidden_act
        self.layer_norm_eps = layer_norm_eps
        self.attention_dropout = attention_dropout
        self.initializer_range = initializer_range
        self.qkv_bias = qkv_bias
        self.mlp_ratio = mlp_ratio
        self.use_abs_pos = use_abs_pos
        self.use_rel_pos = use_rel_pos
        self.window_size = window_size
        self.global_attn_indexes = global_attn_indexes
        self.num_pos_feats = num_pos_feats
        self.mlp_dim = int(mlp_ratio * hidden_size) if mlp_dim is None else mlp_dim


class SamConfig(PretrainedConfig):
    r"""
    [`SamConfig`] is the configuration class to store the configuration of a [`SamModel`]. It is used to instantiate a
    SAM model according to the specified arguments, defining the vision model, prompt-encoder model and mask decoder
    configs. Instantiating a configuration with the defaults will yield a similar configuration to that of the
    SAM-ViT-H [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (Union[`dict`, `SamVisionConfig`], *optional*):
            Dictionary of configuration options used to initialize [`SamVisionConfig`].
        prompt_encoder_config (Union[`dict`, `SamPromptEncoderConfig`], *optional*):
            Dictionary of configuration options used to initialize [`SamPromptEncoderConfig`].
        mask_decoder_config (Union[`dict`, `SamMaskDecoderConfig`], *optional*):
            Dictionary of configuration options used to initialize [`SamMaskDecoderConfig`].

        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```python
    >>> from transformers import (
    ...     SamVisionConfig,
    ...     SamPromptEncoderConfig,
    ...     SamMaskDecoderConfig,
    ...     SamModel,
    ... )

    >>> # Initializing a SamConfig with `"facebook/sam-vit-huge"` style configuration
    >>> configuration = SamConfig()

    >>> # Initializing a SamModel (with random weights) from the `"facebook/sam-vit-huge"` style configuration
    >>> model = SamModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a SamConfig from a SamVisionConfig, SamPromptEncoderConfig, and SamMaskDecoderConfig

    >>> # Initializing SAM vision, prompt encoder, and mask decoder configurations
    >>> vision_config = SamVisionConfig()
    >>> prompt_encoder_config = SamPromptEncoderConfig()
    >>> mask_decoder_config = SamMaskDecoderConfig()

    >>> config = SamConfig(vision_config, prompt_encoder_config, mask_decoder_config)
    ```"""

    model_type = "sam"
    sub_configs = {
        "prompt_encoder_config": SamPromptEncoderConfig,
        "mask_decoder_config": SamMaskDecoderConfig,
        "vision_config": SamVisionConfig,
    }

    def __init__(self, vision_config=None, prompt_encoder_config=None, mask_decoder_config=None,
                 initializer_range=0.02, **kwargs):
        super().__init__(**kwargs)
        vision_config = vision_config if vision_config is not None else {}
        prompt_encoder_config = prompt_encoder_config if prompt_encoder_config is not None else {}
        mask_decoder_config = mask_decoder_config if mask_decoder_config is not None else {}

        if isinstance(vision_config, SamVisionConfig):
            vision_config = vision_config.to_dict()
        if isinstance(prompt_encoder_config, SamPromptEncoderConfig):
            prompt_encoder_config = prompt_encoder_config.to_dict()
        if isinstance(mask_decoder_config, SamMaskDecoderConfig):
            mask_decoder_config = mask_decoder_config.to_dict()

        self.vision_config = SamVisionConfig(**vision_config)
        self.prompt_encoder_config = SamPromptEncoderConfig(**prompt_encoder_config)
        self.mask_decoder_config = SamMaskDecoderConfig(**mask_decoder_config)
        self.initializer_range = initializer_range


__all__ = ["SamConfig", "SamMaskDecoderConfig", "SamPromptEncoderConfig", "SamVisionConfig"]