
     `ia[                        d dl mZmZmZ d dlZd dlmZ ddlmZ ddl	m
Z
 ddlmZmZ ddlmZmZ ddlmZ dd	lmZmZ d
dlmZmZmZmZmZmZmZ d
dlmZ d
dl m!Z!m"Z"  ej#        e$          Z% G d de          Z& G d de          Z' G d de!          Z( G d de          Z) G d de          Z* G d de          Z+ G d de          Z, G d de          Z-e G d de                      Z. G d  d!e          Z/g d"Z0dS )#    )CallableOptionalUnionN   )PretrainedConfig)FlashAttentionKwargs)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringlogging   )CLIPMLPCLIPAttentionCLIPEncoderCLIPEncoderLayerCLIPVisionEmbeddingsCLIPVisionModelCLIPVisionTransformer)eager_attention_forward)VisionRotaryEmbeddingapply_rotary_pos_emb_visionc                   F     e Zd ZdZdZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )MLCDVisionConfiga  
    This is the configuration class to store the configuration of a [`MLCDVisionModel`]. It is used to instantiate a MLCD
    vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the vision encoder of the MLCD
    [DeepGlint-AI/mlcd-vit-bigG-patch14-336](https://huggingface.co/DeepGlint-AI/mlcd-vit-bigG-patch14-336) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 1664):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 8192):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        projection_dim (`int`, *optional*, defaults to 1024):
            Dimensionality of text and vision projection layers.
        num_hidden_layers (`int`, *optional*, defaults to 48):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        image_size (`int`, *optional*, defaults to 336):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).

    Example:

    ```python
    >>> from transformers import MLCDVisionConfig, MLCDVisionModel

    >>> # Initializing a MLCDVisionConfig with DeepGlint-AI/mlcd-vit-bigG-patch14-336 style configuration
    >>> configuration = MLCDVisionConfig()

    >>> # Initializing a MLCDVisionModel (with random weights) from the DeepGlint-AI/mlcd-vit-bigG-patch14-336 style configuration
    >>> model = MLCDVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```mlcd_vision_modelvision_config      0         r   P     geluh㈵>        {Gz?      ?c                      t                      j        di | || _        || _        || _        || _        || _        || _        || _        || _	        || _
        || _        || _        |
| _        |	| _        d S )N )super__init__hidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_groupsnum_channels
patch_size
image_sizeinitializer_rangeinitializer_factorattention_dropoutlayer_norm_eps
hidden_act)selfr/   r0   r1   r2   r3   r4   r6   r5   r;   r:   r9   r7   r8   kwargs	__class__s                  y/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/mlcd/modular_mlcd.pyr.   zMLCDVisionConfig.__init__d   s    " 	""6"""&!2!2#6 $8!($$!2"4!2,$    )r   r    r!   r"   r#   r   r$   r%   r&   r'   r(   r)   r*   )__name__
__module____qualname____doc__
model_typebase_config_keyr.   __classcell__r>   s   @r?   r   r   *   s}        4 4l %J%O % % % % % % % % % %r@   r   c                       e Zd ZdS )MLCDMLPN)rA   rB   rC   r,   r@   r?   rJ   rJ      s        Dr@   rJ   c                   ,    e Zd Zdededej        fdZdS )MLCDRotaryEmbeddingnum_patches_heightnum_patches_widthreturnc                 z   t          j        || j        j                                      d                              d|          }t          j        || j        j                                      d                              |d          }t          j        |                                |                                gd          }t          ||          }t          j        || j        j        | j        j	                  }t          j
        || j                  }||                             d          }	|	S )a}  
        Calculate the Rotary Position Embedding (RoPE) for MLCDVisionModel based on the grid size.

        Args:
            num_patches_height (int): Number of patches in the height dimension.
            num_patches_width (int): Number of patches in the width dimension.

        Returns:
            torch.Tensor: Rotary positional embeddings for the given grid size.
        )devicer#   r   dim)rQ   dtype)torcharangeinv_freqrQ   	unsqueezeexpandstackflattenmaxrU   outer)
r<   rM   rN   hpos_idswpos_idspos_idsmax_grid_sizeseqrotary_pos_emb_fullrotary_pos_embs
             r?   forwardzMLCDRotaryEmbedding.forward   s    L+DM4HIIISSTUVV]]^`bstt 	 L*4=3GHHHRRSTUU\\]oqstt 	
 +x//1183C3C3E3EFBOOO .0ABBl=1ET]M`aaa#k#t}== -W5==a@@r@   N)rA   rB   rC   intrV   Tensorrf   r,   r@   r?   rL   rL      s?        # # %,      r@   rL   c                   H     e Zd Zdef fdZdej        dej        fdZ xZ	S )MLCDVisionEmbeddingsconfigc                 N    t                                          |           | `d S N)r-   r.   position_embeddingr<   rk   r>   s     r?   r.   zMLCDVisionEmbeddings.__init__   s'       ###r@   pixel_valuesrO   c                 N   |j         d         }| j        j        j        }|                     |                    |                    }|                    d                              dd          }| j                            |dd          }t          j
        ||gd          }|S )Nr   )rU   r   r#   rR   rS   )shapepatch_embeddingweightrU   tor\   	transposeclass_embeddingrZ   rV   cat)r<   rp   
batch_sizetarget_dtypepatch_embedsclass_embeds
embeddingss          r?   rf   zMLCDVisionEmbeddings.forward   s    !'*
+28++LOO,O,O,OPP#++A..88A>>+22:q"EEYl;CCC
r@   )
rA   rB   rC   r   r.   rV   FloatTensorrh   rf   rG   rH   s   @r?   rj   rj      sl        $/ $ $ $ $ $ $
E$5 
%, 
 
 
 
 
 
 
 
r@   rj   c                        e Zd ZdZdef fdZ	 ddej        deej        ej        f         de	ej                 de
e         d	eej        e	ej                 f         f
d
Z xZS )MLCDAttentionzMulti-headed attention with RoPE. Refer to papers:
    - Attention is all you need:
        https://huggingface.co/papers/1706.03762
    - RoFormer: Enhanced Transformer with Rotary Position Embedding:
        https://huggingface.co/papers/2104.09864
    rk   c                 p    t                                          |           |j        | _        d| _        d S NF)r-   r.   r3   	is_causalro   s     r?   r.   zMLCDAttention.__init__   s1       $*$?!r@   Nhidden_statesposition_embeddingsattention_maskr=   rO   c                    |j         d d         \  }}|                     |                              ||| j        | j        f          }|                     |                              ||| j        | j        f          }|                     |                              ||| j        | j        f          }	|d                             d                                          }
|d                             d                                          }t          |||
|          \  }}|
                    dddd                                          }|
                    dddd                                          }|	
                    dddd                                          }	t          }| j        j        dk    rt          | j        j                 } || |||	|f| j        sdn| j        | j        | j        d|\  }}|
                    dddd                                          }|                    ||d          }|                     |          }|
                    ddd                                          }||fS )	NrR   r   r#   r   r   eagerr(   )dropoutscalingr   )rr   q_projreshape	num_headshead_dimk_projv_projrY   floatr   permute
contiguousr   rk   _attn_implementationr   trainingr   scaler   viewout_proj)r<   r   r   r   r=   ry   
seq_lengthquery_states
key_statesvalue_statescossinattention_interfaceattn_outputattn_weightss                  r?   rf   zMLCDAttention.forward   sz    "/!4SbS!9
J {{=1199:zSWSacgcp:qrr[[//77ZQUQ_aean8opp
{{=1199:zSWSacgcp:qrr "!$..q117799!!$..q117799#>|ZY\^a#b#b j $++Aq!Q77BBDD''1a33>>@@
#++Aq!Q77BBDD(?;+w66"9$+:Z"[$7$7
%
  $}>CC$,Jn
%
 
%
 
%
 
%
!\ "))!Q155@@BB!&&z:rBBmmK00!))!Q22==??L((r@   rm   )rA   rB   rC   rD   r   r.   rV   rh   tupler   r   r   rf   rG   rH   s   @r?   r   r      s         /       26	,) ,)|,) #5<#=>,) !.	,)
 -.,) 
u|Xel33	4,) ,) ,) ,) ,) ,) ,) ,)r@   r   c                        e Zd Zdef fdZ	 	 ddej        deej        ej        f         deej                 dee	         d	eej
                 f
d
Z xZS )MLCDEncoderLayerrk   c                 r    t                                          |           t          |          | _        d S rm   )r-   r.   r   	self_attnro   s     r?   r.   zMLCDEncoderLayer.__init__   s.       &v..r@   NFr   r   r   output_attentionsrO   c                     |}|                      |          }|                     ||||          \  }}||z   }|}|                     |          }|                     |          }||z   }|f}|r||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
                Represents the hidden states from the previous layer or the input embeddings.
            position_embeddings (`tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        r   r   r   r   )layer_norm1r   layer_norm2mlp)r<   r   r   r   r   residualr   outputss           r?   rf   zMLCDEncoderLayer.forward   s    * !((77&*nn' 3)/	 '5 '
 '
#| !=0 ((77// =0 " 	'&Gr@   r   )rA   rB   rC   r   r.   rV   rh   r   r   boolr~   rf   rG   rH   s   @r?   r   r      s        // / / / / / / 26,1* *|* #5<#=>* !.	*
 $D>* 
u 	!* * * * * * * *r@   r   c                        e Zd ZdZdef fdZ	 	 	 	 ddej        deej	        ej	        f         de
ej	                 de
e         d	e
e         d
e
e         deeef         fdZ xZS )MLCDEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`MLCDEncoderLayer`].

    Args:
        config: MLCDVisionConfig
    rk   c                 J    t                                          |           dS )z3Overwrite dummy `MLCDConfig` to `MLCDVisionConfig`.N)r-   r.   ro   s     r?   r.   zMLCDEncoder.__init__5  s!         r@   Ninputs_embedsr   r   r   output_hidden_statesreturn_dictrO   c                 |   ||n| j         j        }||n| j         j        }||n| j         j        }|rdnd}|rdnd}|}	t	          | j                  D ]2\  }
}|r||	fz   } ||	|||          }|d         }	|r||d         fz   }3|r||	fz   }|st          d |	||fD                       S t          |	||          S )aj  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            position_embeddings (`tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr,   r   r   r#   c              3      K   | ]}||V  	d S rm   r,   ).0vs     r?   	<genexpr>z&MLCDEncoder.forward.<locals>.<genexpr>w  s(      eeqWXWdWdWdWdWdeer@   )last_hidden_stater   
attentions)rk   r   use_return_dictr   	enumeratelayersr   r	   )r<   r   r   r   r   r   r   encoder_statesall_attentionsr   idxencoder_layerlayer_outputss                r?   rf   zMLCDEncoder.forward9  sM   D %9$D  $+Jj 	 &1%<kk$+B]1B1N--TXT_Tq3=0:d%"+DK"8"8 	F 	FC# C!/=2B!B)M+$7-"3	  M *!,M  F!/=3C2E!E 	?+}.>>N 	fee]NN$Seeeeee+(%
 
 
 	
r@   NNNN)rA   rB   rC   rD   r   r.   rV   r~   r   rh   r   r   r   r	   rf   rG   rH   s   @r?   r   r   ,  s         !/ ! ! ! ! ! ! 26,0/3&*C
 C
(C
 #5<#=>C
 !.	C

 $D>C
 'tnC
 d^C
 
uo%	&C
 C
 C
 C
 C
 C
 C
 C
r@   r   c                        e Zd Zdef fdZe	 	 	 	 d
deej                 dee	         dee	         dee	         de
eef         f
d	            Z xZS )MLCDVisionTransformerrk   c                 
   t                                          |           t          |j        |j        z  dz            | _        t          j        t          j	        d|j        |j        z  dz                      | _
        d S )Nr   r#   )r-   r.   rL   r/   r2   vision_rotary_embeddingnn	ParameterrV   randnclass_pos_embro   s     r?   r.   zMLCDVisionTransformer.__init__  sr       ':6;MQWQk;kop;p'q'q$\%+a9KvOi9imn9n*o*oppr@   Nrp   r   r   r   rO   c                 L   ||n| j         j        }||n| j         j        }||n| j         j        }|t	          d          |j        d         | j         j        z  }|j        d         | j         j        z  }|                     ||          }|                    | j	        j
                  }t          j        | j	        |gd          }t          j        ||fd          }|                                |                                f}	|                     |          }
|                     |
          }
|                     |
|	|||          }|d         }|d d dd d f         }|                     |          }|s||f|dd          z   S t'          |||j        |j                  S )	Nz You have to specify pixel_valuesrR   r   rS   )r   r   r   r   r   r#   )r   pooler_outputr   r   )rk   r   r   r   
ValueErrorrr   r5   r   ru   r   rQ   rV   rx   r   r   r}   pre_layrnormencoderpost_layernormr
   r   r   )r<   rp   r   r   r   rM   rN   re   embr   r   encoder_outputsr   pooled_outputs                 r?   rf   zMLCDVisionTransformer.forward  s    %9$D  $+Jj 	 &1%<kk$+B]1B1N--TXT_Tq?@@@)/3t{7MM(.r2dk6LL556HJ[\\'**4+=+DEED$6#GQOOOi8bAAA"wwyy#''))455))-88,,' 3/!5# ' 
 
 ,A.)!!!Q'2++M:: 	L%}58KKK)/')7&1	
 
 
 	
r@   r   )rA   rB   rC   r   r.   r   r   rV   r~   r   r   r   r
   rf   rG   rH   s   @r?   r   r     s        q/ q q q q q q
  59,0/3&*/
 /
u01/
 $D>/
 'tn	/

 d^/
 
u00	1/
 /
 /
 ^/
 /
 /
 /
 /
r@   r   c                   0    e Zd ZU eed<   dZdZdZdZd Z	dS )MLCDPreTrainedModelrk   mlcdTc                    | j         j        }t          |t                    rx| j         j        }t          j                            |j        d|j        dz  |z             t          j                            |j	        j
        |j         j        |z             dS t          |t                    r| j         j        }|j        dz  d|j         j        z  dz  z  |z  }|j        dz  |z  }t          j                            |j        j
        |           t          j                            |j        j
        |           t          j                            |j        j
        |           t          j                            |j        j
        |           dS t          |t$                    r| j         j        }|j         j        dz  d|j         j        z  dz  z  |z  }d|j         j        z  dz  |z  }t          j                            |j        j
        |           t          j                            |j        j
        |           dS t          |t,                    rW| j         j        }|j         j        |j         j        z  dz  dz  |z  }t          j                            |j        d|           dS t          |t          j                  r?|j        j                                         |j
        j                            d           dS t          |t          j                  r'|j        "|j        j                                         dS dS dS )zInitialize the weightsr(   g      )meanstd)r   r   r*   N)rk   r8   
isinstancerj   r   initnormal_rw   	embed_dimrs   rt   r7   r   r1   r   r   r   r   rJ   r/   fc1fc2r   r2   r   	LayerNormbiasdatazero_fill_Linear)r<   modulefactorin_proj_stdout_proj_stdfc_stdpos_emb_stds          r?   _init_weightsz!MLCDPreTrainedModel._init_weights  s   /f233 	%[3FGOOF2&BRTXBX[aBaObbbGOOF29v}?^ag?gOhhhhh.. 	%[3F!+T1q6=;Z7Z_c6cdgmmK",d2f<LGOOFM0kOBBBGOOFM0kOBBBGOOFM0kOBBBGOOFO2OEEEEE(( 	%[3F!=4d:FMDc@chl?lmpvvK&-33<vEFGOOFJ-6O:::GOOFJ-;O????? 566 	%[3F!=48YY]^^cggjppKGOOF0sOLLLLL-- 	%K""$$$M$$S)))))	** 	%v{/FK""$$$$$	% 	%/F/Fr@   N)
rA   rB   rC   r   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpar   r,   r@   r?   r   r     sH         &*#N% % % % %r@   r   c                       e Zd Ze	 	 	 	 ddeej                 dee         dee         dee         dee	e
f         f
d            ZdS )	MLCDVisionModelNrp   r   r   r   rO   c                     ||n| j         j        }||n| j         j        }||n| j         j        }|                     ||||          S )a  
        Example:

        ```python
        >>> import requests
        >>> from PIL import Image
        >>> from transformers import AutoProcessor, MLCDVisionModel
        >>> model = MLCDVisionModel.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")
        >>> processor = AutoProcessor.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs, output_attentions=True)

        >>> features = outputs.last_hidden_state
        >>> print(f"Extracted features shape: {features.shape}")
        >>> print(f"Number of attention layers: {len(outputs.attentions)}")
        >>> print(f"Attention shape: {outputs.attentions[0].shape}")
        ```N)rp   r   r   r   )rk   r   r   r   vision_model)r<   rp   r   r   r   s        r?   rf   zMLCDVisionModel.forward  sr    > %9$D  $+Jj 	 &1%<kk$+B]1B1N--TXT_Tq  %/!5#	 ! 
 
 	
r@   r   )rA   rB   rC   r   r   rV   r~   r   r   r   r
   rf   r,   r@   r?   r   r     s         59,0/3&*(
 (
u01(
 $D>(
 'tn	(

 d^(
 
u00	1(
 (
 (
 ^(
 (
 (
r@   r   )r   r   r   )1typingr   r   r   rV   torch.nnr   configuration_utilsr   modeling_flash_attention_utilsr   modeling_outputsr	   r
   modeling_utilsr   r   processing_utilsr   utilsr   r   clip.modeling_clipr   r   r   r   r   r   r   llama.modeling_llamar   qwen2_vl.modeling_qwen2_vlr   r   
get_loggerrA   loggerr   rJ   rL   rj   r   r   r   r   r   r   __all__r,   r@   r?   <module>r     s:   - , , , , , , , , ,        3 3 3 3 3 3 B B B B B B K K K K K K K K F F F F F F F F & & & & & & , , , , , , , ,                  ; : : : : : [ [ [ [ [ [ [ [ 
	H	%	%Y% Y% Y% Y% Y%' Y% Y% Y%x	 	 	 	 	g 	 	 	    /   D    /   $9) 9) 9) 9) 9)M 9) 9) 9)x/ / / / /' / / /dP
 P
 P
 P
 P
+ P
 P
 P
f6
 6
 6
 6
 6
1 6
 6
 6
r $% $% $% $% $%/ $% $% $%N*
 *
 *
 *
 *
o *
 *
 *
Z  r@   