
     `ij                        d dl mZmZmZ d dlZd dlmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZmZ ddlmZmZ dd	lmZ dd
lmZmZmZ ddlmZ  G d dej                  Z G d dej                  Z G d dej                  Z	 d5dej        dej        dej        dej        deej                 de de dee         fdZ!d Z"dej        de#d ej        fd!Z$d"ej        d#ej        d$ej        d%ej        d e%ej        ej        f         f
d&Z& G d' d(ej                  Z' G d) d*e          Z( G d+ d,ej                  Z) G d- d.ej                  Z*e G d/ d0e                      Z+ ed12           G d3 d4e+                      Z,d0d4gZ-dS )6    )CallableOptionalUnionN   )ACT2FN)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstring	torch_int   )MLCDVisionConfigc                   B     e Zd Z fdZdej        dej        fdZ xZS )MLCDMLPc                    t                                                       || _        t          |j                 | _        t          j        |j        |j	                  | _
        t          j        |j	        |j                  | _        d S N)super__init__configr   
hidden_actactivation_fnnnLinearhidden_sizeintermediate_sizefc1fc2selfr   	__class__s     z/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/mlcd/modeling_mlcd.pyr   zMLCDMLP.__init__%   sf    #F$569V/1IJJ9V5v7IJJ    hidden_statesreturnc                     |                      |          }|                     |          }|                     |          }|S r   )r!   r   r"   )r$   r(   s     r&   forwardzMLCDMLP.forward,   s=    //**=99//r'   )__name__
__module____qualname__r   torchTensorr+   __classcell__r%   s   @r&   r   r   $   sc        K K K K KU\ el        r'   r   c                   b     e Zd ZU ej        ed<   ddededdf fdZded	edej        fd
Z	 xZ
S )MLCDRotaryEmbeddinginv_freq     @dimthetar)   Nc                     t                                                       d|t          j        d|dt          j                  |z  z  z  }|                     d|d           d S )N      ?r      dtyper5   F
persistent)r   r   r/   arangefloatregister_buffer)r$   r7   r8   r5   r%   s       r&   r   zMLCDRotaryEmbedding.__init__6   sd    %ELC%+$N$N$NQT$TUVZeDDDDDr'   num_patches_heightnum_patches_widthc                 z   t          j        || j        j                                      d                              d|          }t          j        || j        j                                      d                              |d          }t          j        |                                |                                gd          }t          ||          }t          j        || j        j        | j        j	                  }t          j
        || j                  }||                             d          }	|	S )a}  
        Calculate the Rotary Position Embedding (RoPE) for MLCDVisionModel based on the grid size.

        Args:
            num_patches_height (int): Number of patches in the height dimension.
            num_patches_width (int): Number of patches in the width dimension.

        Returns:
            torch.Tensor: Rotary positional embeddings for the given grid size.
        )devicer   r   r7   )rF   r=   )r/   r@   r5   rF   	unsqueezeexpandstackflattenmaxr=   outer)
r$   rC   rD   hpos_idswpos_idspos_idsmax_grid_sizeseqrotary_pos_emb_fullrotary_pos_embs
             r&   r+   zMLCDRotaryEmbedding.forward;   s    L+DM4HIIISSTUVV]]^`bstt 	 L*4=3GHHHRRSTUU\\]oqstt 	
 +x//1183C3C3E3EFBOOO .0ABBl=1ET]M`aaa#k#t}== -W5==a@@r'   )r6   )r,   r-   r.   r/   r0   __annotations__intrA   r   r+   r1   r2   s   @r&   r4   r4   3   s         lE EC E ED E E E E E E
# # %,        r'   r4   c                   t     e Zd Zdef fdZdej        dededej        fdZdej	        dej        fd	Z
 xZS )
MLCDVisionEmbeddingsr   c                 2   t                                                       || _        |j        | _        |j        | _        |j        | _        t          j        t          j
        | j                            | _        t          j        |j        | j        | j        | j        d          | _        | j        | j        z  dz  | _        | j        dz   | _        |                     dt          j        | j                                      d          d           d S )NF)in_channelsout_channelskernel_sizestridebiasr;   r   position_ids)r   rG   r>   )r   r   r   r   	embed_dim
image_size
patch_sizer   	Parameterr/   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positionsrB   r@   rJ   r#   s     r&   r   zMLCDVisionEmbeddings.__init__]   s    + + +!|EK,G,GHH!y+? 
  
  
 !Ot>1D!-1^U\$:L-M-M-T-TU\-]-]jopppppr'   
embeddingsheightwidthr)   c                    |j         d         dz
  }| j        j                            d          }|j         d         dz
  }t          j                                        s&||k    r ||k    r|                     | j                  S |ddddf         }|ddddf         }|j         d         }	|| j        z  }
|| j        z  }t          |dz            }|
                    d|||	          }|                    dddd          }t          j                            ||
|fdd	
          }|                    dddd                              dd|	          }t	          j        ||fd          S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   NrG   g      ?r   r;   bicubicF)sizemodealign_cornersrH   )shapeposition_embeddingweightrI   r/   jit
is_tracingr`   rc   r   reshapepermuter   
functionalinterpolateviewcat)r$   rl   rm   rn   rj   ru   rk   class_pos_embedpatch_pos_embedr7   
new_height	new_widthsqrt_num_positionss                r&   interpolate_pos_encodingz-MLCDVisionEmbeddings.interpolate_pos_encodingr   s    !&q)A-!4;EEaHH*03a7 y##%% 	>+*F*F6UZ??**4+<===,QQQU3,QQQU3r"t.
T_,	&}c'9::)11!5GI[]`aa)11!Q1==-33i(	 4 
 
 *11!Q1==BB1b#NNy/?;CCCCr'   pixel_valuesc                 N   |j         d         }| j        j        j        }|                     |                    |                    }|                    d                              dd          }| j                            |dd          }t          j
        ||gd          }|S )Nr   r<   r;   r   rG   rH   )rt   ri   rv   r=   torL   	transposerf   rJ   r/   r~   )r$   r   
batch_sizetarget_dtypepatch_embedsclass_embedsrl   s          r&   r+   zMLCDVisionEmbeddings.forward   s    !'*
+28++LOO,O,O,OPP#++A..88A>>+22:q"EEYl;CCC
r'   )r,   r-   r.   r   r   r/   r0   rW   r   FloatTensorr+   r1   r2   s   @r&   rY   rY   \   s        q/ q q q q q q*'D5< 'D 'DUX 'D]b]i 'D 'D 'D 'DR
E$5 
%, 
 
 
 
 
 
 
 
r'   rY           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 R   t          || j                  }t          || j                  }	t          j        ||                    dd                    |z  }
|$|d d d d d d d |j        d         f         }|
|z   }
t          j                            |
dt          j	                  
                    |j                  }
t          j                            |
|| j                  }
t          j        |
|	          }|                    dd                                          }||
fS )Nr;   r   rG   )r7   r=   )ptrainingr   )	repeat_kvnum_key_value_groupsr/   matmulr   rt   r   r{   softmaxfloat32r   r=   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r&   eager_attention_forwardr      s    3 ;<<JUF$?@@L<z';';Aq'A'ABBWLL!$QQQ111.D
0@0D.D%DE#k1=((2U](SSVVW\WbccL=((6?([[L,|\::K''1--88::K$$r'   c                     | dd| j         d         dz  f         }| d| j         d         dz  df         }t          j        | |fd          S )z*Rotates half the hidden dims of the input..NrG   r;   rH   )rt   r/   r~   )xx1x2s      r&   rotate_halfr      s]    	
3"!'"+"""	#B	
3q """	#B9rc2YB''''r'   r(   n_repr)   c                     | j         \  }}}}|dk    r| S | dddddddddf                             |||||          } |                     |||z  ||          S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rt   rJ   ry   )r(   r   batchnum_key_value_headsslenhead_dims         r&   r   r      s    
 2?1D.Ehzz!!!!QQQaaa"23::5BUW\^bdlmmM  (;e(CT8TTTr'   qkcossinc                    | j         }|j         }|                                 |                                }} |                    d                                          |                    d                                          }}| |z  t          |           |z  z   }||z  t          |          |z  z   }|                    |          }|                    |          }||fS )Nr   )r=   rA   rI   r   r   )r   r   r   r   orig_q_dtypeorig_k_dtypeq_embedk_embeds           r&   apply_rotary_pos_emb_visionr      s     7L7L7799aggiiqA}}R  &&((#--*;*;*A*A*C*CC3w;q>>C/0G3w;q>>C/0Gjj&&Gjj&&GGr'   c                        e Zd ZdZdef fdZ	 ddej        deej        ej        f         de	ej                 de
e         d	eej        e	ej                 f         f
d
Z xZS )MLCDAttentionzMulti-headed attention with RoPE. Refer to papers:
    - Attention is all you need:
        https://huggingface.co/papers/1706.03762
    - RoFormer: Enhanced Transformer with Rotary Position Embedding:
        https://huggingface.co/papers/2104.09864
    r   c                    t                                                       || _        |j        | _        |j        | _        | j        | j        z  | _        | j        | j        z  | j        k    r t          d| j         d| j         d          | j        dz  | _	        |j
        | _        d| _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        |j        | _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).      F)r   r   r   r   ra   num_attention_heads	num_headsr   
ValueErrorscaleattention_dropoutr   	is_causalr   r   k_projv_projq_projout_projr   r#   s     r&   r   zMLCDAttention.__init__   s!   +3$.8=4>)T^;;'dn ' 'N' ' '   ]D(
/i??i??i??	$.$.AA$*$?!!!r'   Nr(   position_embeddingsr   r   r)   c                    |j         dd         \  }}|                     |                              ||| j        | j        f          }|                     |                              ||| j        | j        f          }|                     |                              ||| j        | j        f          }	|d                             d                                          }
|d                             d                                          }t          |||
|          \  }}|
                    dddd                                          }|
                    dddd                                          }|	
                    dddd                                          }	t          }| j        j        dk    rt          | j        j                 } || |||	|f| j        sdn| j        | j        | j        d	|\  }}|
                    dddd                                          }|                    ||d          }|                     |          }|
                    ddd                                          }||fS )
z#Input shape: Batch x Time x ChannelNrG   r   r   r;   r   eagerr   )r   r   r   )rt   r   ry   r   r   r   r   rI   rA   r   rz   r   r   r   _attn_implementationr   r   r   r   r   r}   r   )r$   r(   r   r   r   r   
seq_lengthquery_statesr   r   r   r   attention_interfacer   r   s                  r&   r+   zMLCDAttention.forward   sz    "/!4SbS!9
J {{=1199:zSWSacgcp:qrr[[//77ZQUQ_aean8opp
{{=1199:zSWSacgcp:qrr "!$..q117799!!$..q117799#>|ZY\^a#b#b j $++Aq!Q77BBDD''1a33>>@@
#++Aq!Q77BBDD(?;+w66"9$+:Z"[$7$7
%
  $}>CC$,Jn
%
 
%
 
%
 
%
!\ "))!Q155@@BB!&&z:rBBmmK00!))!Q22==??L((r'   r   )r,   r-   r.   __doc__r   r   r/   r0   tupler   r   r   r+   r1   r2   s   @r&   r   r      s         @/ @ @ @ @ @ @2 26	-) -)|-) #5<#=>-) !.	-)
 -.-) 
u|Xel33	4-) -) -) -) -) -) -) -)r'   r   c                        e Zd Zdef fdZ	 	 ddej        deej        ej        f         deej                 dee	         d	eej
                 f
d
Z xZS )MLCDEncoderLayerr   c                 D   t                                                       |j        | _        t	          |          | _        t          j        | j        |j                  | _	        t          |          | _        t          j        | j        |j                  | _        d S )Neps)r   r   r   ra   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2r#   s     r&   r   zMLCDEncoderLayer.__init__1  s}    +&v..<F<QRRR6??<F<QRRRr'   NFr(   r   r   output_attentionsr)   c                     |}|                      |          }|                     ||||          \  }}||z   }|}|                     |          }|                     |          }||z   }|f}|r||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
                Represents the hidden states from the previous layer or the input embeddings.
            position_embeddings (`tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        r(   r   r   r   )r   r   r   r   )r$   r(   r   r   r   residualr   outputss           r&   r+   zMLCDEncoderLayer.forward9  s    * !((77&*nn' 3)/	 '5 '
 '
#| !=0 ((77// =0 " 	'&Gr'   )NF)r,   r-   r.   r   r   r/   r0   r   r   boolr   r+   r1   r2   s   @r&   r   r   0  s        S/ S S S S S S 26,1* *|* #5<#=>* !.	*
 $D>* 
u 	!* * * * * * * *r'   r   c                        e Zd ZdZdef fdZ	 	 	 	 ddej        deej	        ej	        f         de
ej	                 de
e         d	e
e         d
e
e         deeef         fdZ xZS )MLCDEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`MLCDEncoderLayer`].

    Args:
        config: MLCDVisionConfig
    r   c                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        dS )z3Overwrite dummy `MLCDConfig` to `MLCDVisionConfig`.c                 .    g | ]}t                    S  )r   ).0_r   s     r&   
<listcomp>z(MLCDEncoder.__init__.<locals>.<listcomp>s  s"    $g$g$g!%5f%=%=$g$g$gr'   FN)	r   r   r   r   
ModuleListrangenum_hidden_layerslayersgradient_checkpointingr#   s    `r&   r   zMLCDEncoder.__init__o  s`    m$g$g$g$guVMeGfGf$g$g$ghh&+###r'   Ninputs_embedsr   r   r   output_hidden_statesreturn_dictr)   c                 |   ||n| j         j        }||n| j         j        }||n| j         j        }|rdnd}|rdnd}|}	t	          | j                  D ]2\  }
}|r||	fz   } ||	|||          }|d         }	|r||d         fz   }3|r||	fz   }|st          d |	||fD                       S t          |	||          S )aj  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            position_embeddings (`tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr   r   r   r   c              3      K   | ]}||V  	d S r   r   )r   vs     r&   	<genexpr>z&MLCDEncoder.forward.<locals>.<genexpr>  s(      eeqWXWdWdWdWdWdeer'   )last_hidden_stater(   
attentions)r   r   use_return_dictr   	enumerater   r   r
   )r$   r   r   r   r   r   r   encoder_statesall_attentionsr(   idxencoder_layerlayer_outputss                r&   r+   zMLCDEncoder.forwardv  sM   D %9$D  $+Jj 	 &1%<kk$+B]1B1N--TXT_Tq3=0:d%"+DK"8"8 	F 	FC# C!/=2B!B)M+$7-"3	  M *!,M  F!/=3C2E!E 	?+}.>>N 	fee]NN$Seeeeee+(%
 
 
 	
r'   NNNN)r,   r-   r.   r   r   r   r/   r   r   r0   r   r   r   r
   r+   r1   r2   s   @r&   r   r   f  s         ,/ , , , , , , 26,0/3&*C
 C
(C
 #5<#=>C
 !.	C

 $D>C
 'tnC
 d^C
 
uo%	&C
 C
 C
 C
 C
 C
 C
 C
r'   r   c                        e Zd Zdef fdZe	 	 	 	 d
deej                 dee	         dee	         dee	         de
eef         f
d	            Z xZS )MLCDVisionTransformerr   c                    t                                                       || _        |j        }t	          |          | _        t          j        ||j                  | _	        t          |          | _        t          j        ||j                  | _        t          |j        |j        z  dz            | _        t          j        t#          j        d|j        |j        z  dz                      | _        d S )Nr   r;   r   )r   r   r   r   rY   rl   r   r   r   pre_layrnormr   encoderpost_layernormr4   r   vision_rotary_embeddingrd   r/   re   class_pos_emb)r$   r   ra   r%   s      r&   r   zMLCDVisionTransformer.__init__  s    &	.v66L8MNNN"6** l9&:OPPP':6;MQWQk;kop;p'q'q$\%+a9KvOi9imn9n*o*oppr'   Nr   r   r   r   r)   c                 L   ||n| j         j        }||n| j         j        }||n| j         j        }|t	          d          |j        d         | j         j        z  }|j        d         | j         j        z  }|                     ||          }|                    | j	        j
                  }t          j        | j	        |gd          }t          j        ||fd          }|                                |                                f}	|                     |          }
|                     |
          }
|                     |
|	|||          }|d         }|d d dd d f         }|                     |          }|s||f|dd          z   S t'          |||j        |j                  S )	Nz You have to specify pixel_valuesr   rG   r   rH   )r   r   r   r   r   r   )r   pooler_outputr(   r   )r   r   r   r   r   rt   rc   r  r   r	  rF   r/   r~   r   r   rl   r  r  r  r   r(   r   )r$   r   r   r   r   rC   rD   rU   embr   r(   encoder_outputsr   pooled_outputs                 r&   r+   zMLCDVisionTransformer.forward  s    %9$D  $+Jj 	 &1%<kk$+B]1B1N--TXT_Tq?@@@)/3t{7MM(.r2dk6LL556HJ[\\'**4+=+DEED$6#GQOOOi8bAAA"wwyy#''))455))-88,,' 3/!5# ' 
 
 ,A.)!!!Q'2++M:: 	L%}58KKK)/')7&1	
 
 
 	
r'   r  )r,   r-   r.   r   r   r   r   r/   r   r   r   r   r   r+   r1   r2   s   @r&   r  r    s        
q/ 
q 
q 
q 
q 
q 
q  59,0/3&*/
 /
u01/
 $D>/
 'tn	/

 d^/
 
u00	1/
 /
 /
 ^/
 /
 /
 /
 /
r'   r  c                   0    e Zd ZU eed<   dZdZdZdZd Z	dS )MLCDPreTrainedModelr   mlcdTc                    | j         j        }t          |t                    rx| j         j        }t          j                            |j        d|j        dz  |z             t          j                            |j	        j
        |j         j        |z             dS t          |t                    r| j         j        }|j        dz  d|j         j        z  dz  z  |z  }|j        dz  |z  }t          j                            |j        j
        |           t          j                            |j        j
        |           t          j                            |j        j
        |           t          j                            |j        j
        |           dS t          |t$                    r| j         j        }|j         j        dz  d|j         j        z  dz  z  |z  }d|j         j        z  dz  |z  }t          j                            |j        j
        |           t          j                            |j        j
        |           dS t          |t,                    rW| j         j        }|j         j        |j         j        z  dz  dz  |z  }t          j                            |j        d|           dS t          |t          j                  r?|j        j                                         |j
        j                            d           dS t          |t          j                  r'|j        "|j        j                                         dS dS dS )zInitialize the weightsr   r   )meanstd)r  r;   r:   N)r   initializer_factor
isinstancerY   r   initnormal_rf   ra   ri   rv   initializer_ranger   r   r   r   r   r   r   r   r!   r"   r  r   r	  r   r_   datazero_fill_r   )r$   r   factorin_proj_stdout_proj_stdfc_stdpos_emb_stds          r&   _init_weightsz!MLCDPreTrainedModel._init_weights  s   /f233 	%[3FGOOF2&BRTXBX[aBaObbbGOOF29v}?^ag?gOhhhhh.. 	%[3F!+T1q6=;Z7Z_c6cdgmmK",d2f<LGOOFM0kOBBBGOOFM0kOBBBGOOFM0kOBBBGOOFO2OEEEEE(( 	%[3F!=4d:FMDc@chl?lmpvvK&-33<vEFGOOFJ-6O:::GOOFJ-;O????? 566 	%[3F!=48YY]^^cggjppKGOOF0sOLLLLL-- 	%K""$$$M$$S)))))	** 	%v{/FK""$$$$$	% 	%/F/Fr'   N)
r,   r-   r.   r   rV   base_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpar"  r   r'   r&   r  r    sH         &*#N% % % % %r'   r  zN
    The vision model from M_L_C_D without any head or projection on top.
    )custom_introc                        e Zd ZU eed<   dZdgZdef fdZdej	        fdZ
e	 	 	 	 ddeej                 dee         d	ee         d
ee         deeef         f
d            Z xZS )MLCDVisionModelr   r   r   c                     t                                          |           t          |          | _        |                                  d S r   )r   r   r  vision_model	post_initr#   s     r&   r   zMLCDVisionModel.__init__.  sA       1&99r'   r)   c                 $    | j         j        j        S r   )r+  rl   ri   )r$   s    r&   get_input_embeddingsz$MLCDVisionModel.get_input_embeddings4  s     +;;r'   Nr   r   r   c                     ||n| j         j        }||n| j         j        }||n| j         j        }|                     ||||          S )a  
        Example:

        ```python
        >>> import requests
        >>> from PIL import Image
        >>> from transformers import AutoProcessor, MLCDVisionModel
        >>> model = MLCDVisionModel.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")
        >>> processor = AutoProcessor.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs, output_attentions=True)

        >>> features = outputs.last_hidden_state
        >>> print(f"Extracted features shape: {features.shape}")
        >>> print(f"Number of attention layers: {len(outputs.attentions)}")
        >>> print(f"Attention shape: {outputs.attentions[0].shape}")
        ```N)r   r   r   r   )r   r   r   r   r+  )r$   r   r   r   r   s        r&   r+   zMLCDVisionModel.forward7  sr    > %9$D  $+Jj 	 &1%<kk$+B]1B1N--TXT_Tq  %/!5#	 ! 
 
 	
r'   r  )r,   r-   r.   r   rV   main_input_name_no_split_modulesr   r   Moduler.  r   r   r/   r   r   r   r   r   r+   r1   r2   s   @r&   r)  r)  $  s         $O+,/      <bi < < < <  59,0/3&*(
 (
u01(
 $D>(
 'tn	(

 d^(
 
u00	1(
 (
 (
 ^(
 (
 (
 (
 (
r'   r)  )r   ).typingr   r   r   r/   torch.nnr   activationsr   modeling_flash_attention_utilsr   modeling_layersr	   modeling_outputsr
   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   configuration_mlcdr   r2  r   r4   rY   r0   rA   r   r   rW   r   r   r   r   r   r   r  r  r)  __all__r   r'   r&   <module>r>     s  * - , , , , , , , , ,        ! ! ! ! ! ! B B B B B B 9 9 9 9 9 9 K K K K K K K K F F F F F F F F & & & & & & B B B B B B B B B B 0 0 0 0 0 0    bi   & & & & &") & & &RI I I I I29 I I If % %I%<% 
% <	%
 U\*% % % '(% % % %4( ( (	UU\ 	U# 	U%, 	U 	U 	U 	U|+0<>Cl
5<%&   J) J) J) J) J)BI J) J) J)Z3 3 3 3 31 3 3 3lS
 S
 S
 S
 S
") S
 S
 S
l=
 =
 =
 =
 =
BI =
 =
 =
@ $% $% $% $% $%/ $% $% $%N   
7
 7
 7
 7
 7
) 7
 7
 
7
t !"3
4r'   