
     `i                        d Z ddlmZ ddlmZmZmZmZ ddlZddl	m
c mZ ddlm
Z
 ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/m0Z0  e$j1        e2          Z3e e"d           G d de                                  Z4e e"d           G d de                                  Z5	 	 	 	 dBdZ6g fdZ7 G d  d!e
j8                  Z9 G d" d#e
j:                  Z; G d$ d%e
j<                  Z= G d& d'ej
        j<                  Z>d( Z?dCd)Z@ G d* d+e
j<                  ZA	 dDd-e
j<        d.ejB        d/ejB        d0ejB        d1eejB                 d2eCd3eCfd4ZD G d5 d6e
j<                  ZE G d7 d8e          ZF G d9 d:e          ZGe" G d; d<e                      ZHe" G d= d>eH                      ZI G d? d@eHe          ZJg dAZKdS )EzPyTorch Idefics model.    )	dataclass)AnyCallableOptionalUnionN)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask)GradientCheckpointingLayer)ModelOutput)ALL_ATTENTION_FUNCTIONSPretrainedConfigPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg)OutputRecordercheck_model_inputs   )IdeficsConfig)IdeficsPerceiverResampler)IdeficsVisionEmbeddingsIdeficsVisionTransformerz{
    Base class for Idefics model's outputs that may also contain a past key/values (to speed up sequential decoding).
    )custom_introc                       e Zd ZU dZdZeej                 ed<   dZ	ee
         ed<   dZeeej                          ed<   dZeeej                          ed<   dZeeej                          ed<   dS )IdeficsBaseModelOutputWithPasta  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
        sequence_length, hidden_size)`.

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    Nlast_hidden_statepast_key_valueshidden_states
attentionsimage_hidden_states)__name__
__module____qualname____doc__r$   r   torchFloatTensor__annotations__r%   r   r&   tupler'   r(        /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/idefics/modeling_idefics.pyr#   r#   0   s          & 6:x 12999'+OXe_+++8<M8E%"345<<<59Ju012999>B%(9":;BBBBBr2   r#   zS
    Base class for Idefics causal language model (or autoregressive) outputs.
    c                      e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
ee         ed<   dZeeej                          ed<   dZeeej                          ed<   dZeeej                          ed<   dS )	IdeficsCausalLMOutputWithPastae  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
        sequence_length, hidden_size)`.

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    Nlosslogitsr%   r&   r'   r(   )r)   r*   r+   r,   r6   r   r-   r.   r/   r7   r%   r   r&   r0   r'   r(   r1   r2   r3   r5   r5   Q   s          " )-D(5$
%,,,*.FHU&'...'+OXe_+++8<M8E%"345<<<59Ju012999>B%(9":;BBBBBr2   r5   Fc                    t          j        | j        d                                       dd                              d|                              d                              | j                  }|                     d|          } |                    d          |d<   |                    d          |d<   |                    d          |d<   |                    d          |d<   d|v r!|d         }|                    d|          |d<   ||                    d|          |d	<   |d         |d                             d|          |d<   |d          |d                             d|          |d<   nO|d          |d                             d|          |d<   n'|d         |d                             d|          |d<   | |fS )
Nr   r   pixel_valuesimage_encoder_embeddingsperceiver_embeddingsimage_attention_masktoken_type_idsattention_mask)	r-   arangeshapeviewrepeattodeviceindex_selectget)	input_idsexpand_sizeis_encoder_decoderr?   encoder_outputsmodel_kwargsexpanded_return_idxr>   s           r3   expand_inputs_for_generationrN   q   s    	Y_Q'((--b!44;;A{KKPPQSTTWWXaXhii  &&q*=>>I#/#3#3N#C#CL /;/?/?@Z/[/[L+,+7+;+;<R+S+SL'(+7+;+;<R+S+SL'(<''%&67)7)D)DQH[)\)\%&!)7)D)DQH[)\)\%&*+7/;<R/S/`/`"0
 0
+, N#/'3N'C'P'PQRTg'h'h^$$	0	1	=3?@Z3[3h3h"4
 4
/00 
,	-	9/;<R/S/`/`"0
 0
+, l""r2   c                 ,   t           j        t           j        t           j        dfd|D             }|                                 D ]J|r1t          fd|D                       r                    d           5                    d           K| S )N)	LayerNormLinear	Embeddingc                      g | ]
}|         S r1   r1   ).0mmappings     r3   
<listcomp>z freeze_model.<locals>.<listcomp>   s    FFFq
FFFr2   c              3   8   K   | ]}t          |          V  d S N)
isinstance)rT   tmodules     r3   	<genexpr>zfreeze_model.<locals>.<genexpr>   s-      $]$]qZ%:%:$]$]$]$]$]$]r2   TF)r   rP   rQ   rR   modulesanyrequires_grad_)modelmodule_exceptionsmodule_exceptions_mappedrV   r\   s      @@r3   freeze_modelrd      s    \)\ G
  GFFF4EFFF--// ) ) 	)$]$]$]$]D\$]$]$]!]!] 	)!!$''''!!%((((Lr2   c                   T     e Zd ZdZ	 	 	 	 d	dee         ddf fdZd ZdefdZ	 xZ
S )
IdeficsDecoupledEmbeddinga  
    Implements a decoupling of parameters to allow freezing (or not) a subset of the embeddings. In practise, the
    regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `num_additional_embeddings` > 0,
    then it will create `num_additional_embeddings` additional parameters that are always trained. If
    `num_additional_embeddings=0`, then the module defaults back to the regular behavior of `nn.Embedding`.
    FNpartially_freezereturnc           	      N   |||k    rt          d| d|            t                      j        d|||||d| || _        || _        || _        || _        |r| j                            d           | j        dk    r$t          j
        | j        |||          | _        dS dS )	a)  
        Args:
            num_embeddings (`int`):
                Size of the dictionary of embeddings
            num_additional_embeddings (`int`):
                Number of additional embeddings. Only useful when you `partially_freeze=True`.
            embedding_dim (`int`):
                The size of each embedding vector
            partially_freeze: (`bool`, *optional*, defaults to `False`):
                If `True`, the regular `weight` will be frozen. `additional_weight` is never frozen.
            padding_idx (`int`, *optional*):
                The padding index (needs to be less than num_embeddings)

        Note: there are a lot of other parameters to initialize a standard `nn.Embedding` such as `padding_idx`,
        `max_norm` or `norm_type`. We are not supporting these.
        Nz/padding_idx must be within num_embeddings. Got z and )num_embeddingsembedding_dimrE   dtypepadding_idxFr   )rj   rk   rE   rl   r1   )
ValueErrorsuper__init__rj   rm   num_additional_embeddingsrg   weightr`   r   rR   additional_embedding)
selfrj   rq   rk   rg   rE   rl   rm   kwargs	__class__s
            r3   rp   z"IdeficsDecoupledEmbedding.__init__   s    6 "{^'C'Cq{qqaoqqrrr 	
)'#	
 	
 	
 	
 	
 -&)B& 0 	.K&&u---)A--(*#=+	) ) )D%%% .-r2   c                 D   | j         dk    rt          j        || j                  S |                                }t          j        || j        k              }||         }|                     || j        z
            }d||<   t          j        || j                  }|||<   |S )a  
        we have 2 embeddings, with different indices - one pretrained self.weight and another
        self.additional_embedding.weight that is being trained.

        in order to make a lookup of the input ids, we:
        1. find out the indices of the entries belonging to the 2nd embedding
        2. extract those values while subtracting the size of the first embedding (num_embeddings), since the 2nd
           embedding starts from 0 and not num_embeddings
        3. perform the 2nd embedding lookup
        4. now we handle the 1st embedding, we overwrite indices belonging to the 2nd embedding with a padding index
        5. perform the 1st embedding lookup
        6. now we overwrite the values in the 1st embedding lookup with the values of the 2nd embedding lookup

        note: for the 1st embedding lookup we could have looked up only the low indices and not do the padding, but
        then we have to create a new tensor and populate it with 2 tensors that are spread out across various indices -
        i.e. not a simple concat - I haven't benchmarked the complex case if it's any faster, given that seqlens are
        usually relatively short it's probably not faster or if faster not by much - but might be a good idea to
        measure.

        r   )	rq   F	embeddingrr   cloner-   whererj   rs   )rt   rH   additional_vocab_indicesinput_ids_additional_vocabadditional_embeddingsfull_vectors         r3   forwardz!IdeficsDecoupledEmbedding.forward   s    * )Q..;y$+666 OO%%	#(;yD<O/O#P#P %./G%H" $ 9 9:TW[Wj:j k k /0	*+k)T[99 1F,-r2   c                 F    d| j          d| j         d| j         d| j         S )Nznum_embeddings=z, num_additional_embeddings=z, embedding_dim=, partially_freeze=)rj   rq   rk   rg   rt   s    r3   
extra_reprz$IdeficsDecoupledEmbedding.extra_repr  se     A!4  A  ARVRp  A  A  CG  CU  A  A  jn  j  A  A  	Ar2   )FNNN)r)   r*   r+   r,   r   boolrp   r   strr   __classcell__rv   s   @r3   rf   rf      s          ,13 3
 #4.3 
3 3 3 3 3 3j% % %NAC A A A A A A A Ar2   rf   c                   x     e Zd ZdZ	 	 	 	 	 ddedededed	ed
df fdZdej        d
ej        fdZ	d
e
fdZ xZS )IdeficsDecoupledLineara  
    Implements a decoupling of parameters to allow freezing (or not) a subset of the parameters. In practise, the
    regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `out_additional_features` > 0,
    then it will create `out_additional_features * in_features` additional parameters that are always trained. If
    `out_additional_features=0`, then the module defaults back to the regular behavior of `nn.Linear`.
    r   TNin_featuresout_featuresout_additional_featuresbiasrg   rh   c                 F   t                                          |||||           || _        || _        || _        || _        |r6| j                            d           |r| j                            d           |dk    r t          j
        |||||          | _        dS dS )aG  
        out_additional_features: int. Number of additional trainable dimensions. Only makes sense when
        `partially_freeze=True`. partially_freeze: bool. If True, the regular `weight` will be frozen and extra
        parameters (if any) will be trainable. If False, default to the regular behavior of nn.Linear.
        Fr   )r   r   r   rE   rl   N)ro   rp   r   rg   r   r   rr   r`   r   r   rQ   additional_fc)	rt   r   r   r   r   rg   rE   rl   rv   s	           r3   rp   zIdeficsDecoupledLinear.__init__  s     	lD&%HHH'>$ 0&( 	0K&&u--- 0	((///"Q&&!#'4" " "D '&r2   inputc                     t          j        || j        | j                  }| j        dk    r,|                     |          }t          j        ||fd          }|S )Nr   r9   )rx   linearrr   r   r   r   r-   cat)rt   r   outputadditional_featuress       r3   r   zIdeficsDecoupledLinear.forwardC  sW    %di88'!++"&"4"4U";";Y(;<bAAFr2   c           
      Z    d| j          d| j         d| j         d| j        du d| j         
S )z=Overwriting `nn.Linear.extra_repr` to include new parameters.zin_features=z, out_features=z, out_additional_features=z, bias=Nr   r   r   r   r   rg   r   s    r3   r   z!IdeficsDecoupledLinear.extra_reprL  s     Sd.  S  St?P  S  Slp  mI  S  S  RV  R[  cg  Rg  S  S  |@  |Q  S  S  	Sr2   )r   TTNN)r)   r*   r+   r,   intr   rp   r-   Tensorr   r   r   r   r   s   @r3   r   r     s          ()!%" "" " "%	"
 " " 
" " " " " "HU\ el    SC S S S S S S S Sr2   r   c                   ,     e Zd Zd fd	Zd Zd Z xZS )IdeficsRMSNormư>c                     t                                                       t          j        t	          j        |                    | _        || _        dS )z=
        IdeficsRMSNorm is equivalent to T5LayerNorm
        N)ro   rp   r   	Parameterr-   onesrr   variance_epsilon)rt   hidden_sizeepsrv   s      r3   rp   zIdeficsRMSNorm.__init__S  sD     	l5:k#:#:;; #r2   c                 h   |                     t          j                                      d                              dd          }|t          j        || j        z             z  }| j        j        t          j	        t          j
        fv r|                     | j        j                  }| j        |z  S )N   r9   T)keepdim)rD   r-   float32powmeanrsqrtr   rr   rl   float16bfloat16)rt   r&   variances      r3   r   zIdeficsRMSNorm.forward[  s     ##EM2266q99>>r4>PP%Ht?T4T(U(UU ; ???),,T[->??M{]**r2   c                 H    t          | j        j                   d| j         S )Nz, eps=)r0   rr   rA   r   r   s    r3   r   zIdeficsRMSNorm.extra_repre  s&    )**II$2GIIIr2   )r   )r)   r*   r+   rp   r   r   r   r   s   @r3   r   r   R  sb        $ $ $ $ $ $+ + +J J J J J J Jr2   r   c                   .     e Zd Zd fd	Zd ZddZ xZS )	IdeficsEmbedding   '  Nc                    t                                                       || _        || _        || _        d| j        t          j        d| j        dt
          j                                      |t
          j	                  | j        z  z  z  }| 
                    d|d           |                     || j        j        t          j                    	           d S )
N      ?r   r   rl   rE   rl   inv_freqF
persistentseq_lenrE   rl   )ro   rp   dimmax_position_embeddingsbaser-   r@   int64rD   floatregister_buffer_set_cos_sin_cacher   rE   get_default_dtype)rt   r   r   r   rE   r   rv   s         r3   rp   zIdeficsEmbedding.__init__k  s    '>$	IQ!5;???BB&X]XcBddgkgooq
 	ZeDDD 	+DM4HPUPgPiPi 	  	
 	
 	
 	
 	
r2   c                    || _         t          j        | j         |t          j                                      | j                  }t          j        d|| j                  }t          j        ||fd          }|                     d|	                                
                    |          d           |                     d|                                
                    |          d           d S )	Nr   zi,j->ijr9   r   
cos_cachedFr   
sin_cached)max_seq_len_cachedr-   r@   r   type_asr   einsumr   r   cosrD   sin)rt   r   rE   rl   r[   freqsembs          r3   r   z#IdeficsEmbedding._set_cos_sin_cache|  s    ")L0u{SSS[[\`\ijjY4=99iB///\37799<<+>+>5QQQ\37799<<+>+>5QQQQQr2   c                     || j         k    r"|                     ||j        |j                   | j        d |                             |j                  | j        d |                             |j                  fS )Nr   r   )r   r   rE   rl   r   rD   r   )rt   xr   s      r3   r   zIdeficsEmbedding.forward  s}    T,,,##GAHAG#TTT OHWH%((qw(77OHWH%((qw(77
 	
r2   )r   r   NrY   )r)   r*   r+   rp   r   r   r   r   s   @r3   r   r   j  sc        
 
 
 
 
 
"R R R
 
 
 
 
 
 
 
r2   r   c                     | dd| j         d         dz  f         }| d| j         d         dz  df         }t          j        | |fd          S )z*Rotates half the hidden dims of the input..Nr9   r   r   )rA   r-   r   )r   x1x2s      r3   rotate_halfr     s]    	
3"!'"+"""	#B	
3q """	#B9rc2YB''''r2   c                     ||                              |          }||                              |          }| |z  t          |           |z  z   }||z  t          |          |z  z   }||fS )an  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`):
            The position indices of the tokens corresponding to the query and key tensors. For example, this can be
            used to pass offsetted position ids when working with a KV-cache.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr   r   position_idsunsqueeze_dimq_embedk_embeds           r3   apply_rotary_pos_embr     sq    * l

%
%m
4
4C
l

%
%m
4
4C3w;q>>C/0G3w;q>>C/0GGr2   c                   2     e Zd Zdededef fdZd Z xZS )
IdeficsMLPr   intermediate_size
hidden_actc                    t                                                       t          j        ||d          | _        t          j        ||d          | _        t          j        ||d          | _        t          |         | _        d S )NFr   )	ro   rp   r   rQ   	gate_proj	down_projup_projr
   act_fn)rt   r   r   r   rv   s       r3   rp   zIdeficsMLP.__init__  sv     	;0ANNN#4kNNNy.?eLLLZ(r2   c                     |                      |                     |                     |                    |                     |          z            S rY   )r   r   r   r   )rt   r   s     r3   r   zIdeficsMLP.forward  s;    ~~dkk$..*;*;<<t||ANOOOr2   )r)   r*   r+   r   r   rp   r   r   r   s   @r3   r   r     so        
)
) 
) 	
) 
) 
) 
) 
) 
)P P P P P P Pr2   r           r\   querykeyvaluer?   scalingdropoutc                    t          j        ||                    dd                    |z  }|||z   }t          j                            |dt           j                                      |j                  }t          j        	                    ||| j
                  }t          j        ||          }	|	                    dd                                          }	|	|fS )Nr9   )r   rl   ptrainingr   r   )r-   matmul	transposer   
functionalsoftmaxr   rD   rl   r   r   
contiguous)
r\   r   r   r   r?   r   r   ru   attn_weightsattn_outputs
             r3   eager_attention_forwardr     s     <s}}R'<'<==GL!#n4=((2U](SSVVW\WbccL=((6?([[L,|U33K''1--88::K$$r2   c                   |    e Zd ZdZ	 	 	 	 	 ddedededed	ee         d
edee         f fdZ	de
j        dedefdZ eddd          	 	 	 	 	 dde
j        dee
j                 dee
j                 dee
j                 dee         dee
j                 dee         dee
j        e
j        f         fd            Z xZS )IdeficsAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr   FNr   	num_headsr   is_cross_attentionconfigqk_layer_norms	layer_idxc                 0   t                                                       || _        || _        || _        ||z  | _        || _        d| _        | j        dz  | _        || _	        |(t                              d| j        j         d           | j        |z  | j        k    rt          d| j         d| d          || _        t!          t"          j        d          st          d	          | j        rt!          |j        d
          s| j        n|j        j        }t#          j        | j        || j        z  d          | _        t#          j        ||| j        z  d          | _        t#          j        ||| j        z  d          | _        n{t#          j        | j        || j        z  d          | _        t#          j        | j        || j        z  d          | _        t#          j        | j        || j        z  d          | _        t#          j        || j        z  |d          | _        t5          | j                  | _        || _        | j        rBt;          | j        |j                  | _        t;          | j        |j                  | _         d S d S )NTg      zInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.z?hidden_size must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).scaled_dot_product_attentionz)this model requires pytorch 2.0 or higher	embed_dimFr   r   )!ro   rp   r   r   r   head_dimr   	is_causalr   r  loggerwarning_oncerv   r)   rn   r   hasattrr   r   vision_configr  rQ   q_projk_projv_projo_projr   
rotary_embr  r   rms_norm_epsq_layer_normk_layer_norm)
rt   r   r   r   r   r   r  r  kv_input_dimrv   s
            r3   rp   zIdeficsAttention.__init__  s    	&"#y0}d*",!8 , , ,   MI%$*:::3RVRb 3 3%.3 3 3  
 #5r}&DEE 	JHIII" 	(/0Dk(R(Rv  X^XlXv  ) DM)  DK
 )L)dm2KRWXXXDK)DM)  DKK ) DM)  DK
 ) DM)  DK
 ) DM)  DK
 i%
 
 

 +4=99, 	W .t}&BU V V VD .t}&BU V V VD	W 	Wr2   tensorr   bszc                     |                     ||| j        | j                                      dd                                          S )Nr   r   )rB   r   r  r   r   )rt   r  r   r  s       r3   _shapezIdeficsAttention._shape3  s<    {{3GGQQRSUVWWbbdddr2   past_key_valuer%   4.58new_nameversionr&   key_value_statesr?   r   cache_positionru   rh   c                    | j         p|d u}|                                \  }	}
}|                     |                              |	|
| j        | j                                      dd          }|s|                     |                              |	|
| j        | j                                      dd          }|                     |                              |	|
| j        | j                                      dd          }n|                                \  }}}|                     |                              |	|| j        | j                                      dd          }|                     |                              |	|| j        | j                                      dd          }|j	        d         }|||d         z  }|s>| 
                    |t          ||
                    \  }}t          |||||          \  }}|$d|i}|                    ||| j        |          \  }}| j        r*|                     |          }|                     |          }t$          }| j        j        dk    rt*          | j        j                 } || ||||f| j        sdn| j        | j        d	|\  }}|                    |	|
d
                                          }|                     |          }||fS )Nr   r   r   r   )r   r   eagerr   )r   r   r9   )r   sizer  rB   r   r  r   r  r  rA   r  maxr   updater  r  r  r  r   r   _attn_implementationr   r   r   r   reshaper   r  )rt   r&   r  r?   r   r%   r   ru   r   r  q_len_query_states
key_statesvalue_stateskv_len
kv_seq_lenr   r   cache_kwargsattention_interfacer   r   s                          r3   r   zIdeficsAttention.forward6  s    "4T8HPT8T%**,,UA{{=1166sE4>SWS`aakklmopqq! 	]3388eT^UYUbccmmnoqrssJ;;}55::3t~W[WdeeoopqstuuLL+0022LAvq%566;;CY]YfggqqrsuvwwJ,--223PTP]^^hhijlmnn   %b)
&.++J! 	n|SU=S=STTHC';L*VY[^`l'm'm$L* &,n=L'6'='=j,X\Xfht'u'u$J 	7,,\::L**:66J(?;+w66"9$+:Z"[$7$7	%
  $}>CC$,L	%
 	%
 	%
 	%
!\ "))#ub99DDFFkk+..L((r2   )r   FNFNNNNNN)r)   r*   r+   r,   r   r   r   r   r   rp   r-   r   r  r   
LongTensorr   r   r   r0   r   r   r   s   @r3   r   r     s       GG #(-1$#'OW OWOW OW 	OW
 !OW )*OW OW C=OW OW OW OW OW OWbeU\ eC ec e e e e _%0A6RRR 481537+/59?) ?)|?) #5<0?) !.	?)
 u/0?) "%?) !!12?) +,?) 
u|U\)	*?) ?) ?) SR?) ?) ?) ?) ?)r2   r   c                   
    e Zd Zddedee         f fdZ eddd          e	 	 	 	 dd	e	j
        d
ee	j
                 dee	j                 dee         dee	j                 dee         de	j        fd                        Z xZS )IdeficsDecoderLayerNr   r  c                    t                                                       |j        | _        t          | j        |j        |j        ||          | _        t          | j        |j        |j	                  | _
        t          |j        |j                  | _        t          |j        |j                  | _        |j        | _        d S )N)r   r   r   r   r  r   r   r   r  )ro   rp   r   r   num_attention_headsr   	self_attnr   r   r   mlpr   r  input_layernormpost_attention_layernormrt   r   r  rv   s      r3   rp   zIdeficsDecoderLayer.__init__{  s    !-)(0N
 
 
 ($6(
 
 

  .f.@fFYZZZ(6v7IvOb(c(c(c%~r2   r  r%   r  r  r&   r?   r   r   ru   rh   c           	      |   |}|                      |          } | j        d|||||d|\  }}t          j                            || j        | j                  }||z   }|}|                     |          }|                     |          }t          j                            || j        | j                  }||z   }|S )N)r&   r?   r   r%   r   r   r1   )r:  r8  r   r   r   r   r;  r9  )	rt   r&   r?   r   r%   r   ru   residualr)  s	            r3   r   zIdeficsDecoderLayer.forward  s     !,,];; *4> 
')%+)
 
 
 
q --mt|VZVc-dd =0 !55mDD//--mt|VZVc-dd =0r2   rY   )NNNN)r)   r*   r+   r   r   r   rp   r   r   r-   r   r2  r   r   r   r.   r   r   r   s   @r3   r4  r4  z  s       & &} &# & & & & & && _%0A6RRR 2637+/59   |  !.  u/0	 
 "%  !!12  +,  
	      ^ SR         r2   r4  c                   &    e Zd Zddedee         f fdZ eddd          e	 	 	 	 	 dd	e	j
        d
ee	j
                 dee	j
                 dee	j
                 dee	j
                 dee         dee         de	j        fd                        Z xZS )IdeficsGatedCrossAttentionLayerNr   r  c           	         t                                                       |j        | _        t          | j        |j        d|j        ||j        |          | _        t          | j        |j	        |j
                  | _        t          |j        |j                  | _        t          |j        |j                  | _        |j        | _        t#          j                    | _        t#          j                    | _        |j        dk    r|j        dk    rft#          j        t1          j        dd| j                            | _        t#          j        t1          j        dd| j                            | _        n|j        dk    rXt#          j        t1          j        d                    | _        t#          j        t1          j        d                    | _        n=t9          d	|j         d
          |j        dk    r|j        dk    rft#          j        t1          j        dd| j                            | _        t#          j        t1          j        dd| j                            | _        n|j        dk    rXt#          j        t1          j        d                    | _        t#          j        t1          j        d                    | _        nFt9          d	|j         d
          |j        dv r|j        dk    rwt#          j        t1          j        d|j        dd| j        f                    | _        t#          j        t1          j        d|j        dd| j        f                    | _        n|j        dk    rgt#          j        t1          j        d|j        d                    | _        t#          j        t1          j        d|j        d                    | _        n0t9          d	|j         d
          tA          d|j         d          tC          | d          rtC          | d          st9          d          d S )NT)r   r   r   r   r   r  r  r6  r  zerosvectorr   r   z Unknown value for `alpha_type` ()r   >   normalrandomgaussianr   )r   stdr#  zAlpha initialization scheme z not yet implemented!alpha_cross_attnalpha_densez+Alpha parameters not initialized correctly!)"ro   rp   r   r   r7  r   r  
cross_attnr   r   r   r9  r   r  r:  r;  r   r   Tanhact_cross_attn	act_densealpha_initializer
alpha_typer   r-   rB  rI  rJ  rn   r   rE  alphas_initializer_rangeNotImplementedErrorr  r<  s      r3   rp   z(IdeficsGatedCrossAttentionLayer.__init__  s   !-*(0#N!0
 
 
 ($6(
 
 

  .f.@fFYZZZ(6v7IvOb(c(c(c%n gii#w.. H,,(*U[AtGW5X5X(Y(Y%#%<Aq$BR0S0S#T#T  "g--(*U[^^(D(D%#%<A#?#?   !XFDU!X!X!XYYY%// H,,(*UZ1dFV5W5W(X(X%#%<
1aAQ0R0R#S#S  "g--(*UZ]](C(C%#%<
1#>#>   !XFDU!X!X!XYYY%)III H,,(*Lcv/NVWYZ\`\lUmnnn) )% $&<Lcv/NVWYZ\`\lUmnnn$ $   "g--(*Lcv/NVWYYY) )% $&<#6Kjrs0u0u0u#v#v   !XFDU!X!X!XYYY &&tVE]&t&t&tuuu011 	LgdM6R6R 	LJKKK	L 	Lr2   r  r%   r  r  r&   r?   r(   r=   cross_attention_gateru   rh   c                    |t          d          |t          d          |t          d          |}|                     |          } | j        d	|||d|\  }}	t          j                            || j        | j                  }|	                    |dk    dddddf         d          }|| 
                    | j                  |z  z   }|}|                     |          }|                     |          }t          j                            || j        | j                  }||                     | j                  |z  z   }|S )
a  
        image_hidden_states (`torch.FloatTensor`):
            Input to the layer of shape `(batch, seq_len, embed_dim)`
        image_attention_mask (`torch.FloatTensor`, *optional*):
            image attention mask of size
            `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
        cross_attention_gate (`torch.FloatTensor`, *optional*):
            gate of size `(batch, seq_len)` used to zero-out cross-attention output for tokens attending no images.
        Nzt`image_hidden_states` is required for Idefics cross attention module which are visual features to be conditioned on.z`cross_attention_gate` is required for Idefics cross attention module to zero-out the cross-attention hidden_states attending to no images.zMPast key value states are not implemented for Idefics cross attention module.)r&   r  r?   r   r   r   r1   )rn   rR  r:  rK  r   r   r   r   r   masked_fillrM  rI  r;  r9  rN  rJ  )
rt   r&   r?   r(   r=   rS  r%   ru   r>  r)  s
             r3   r   z'IdeficsGatedCrossAttentionLayer.forward  s   * &#  
  ' ^   &%&uvvv ,,];; +4? 
'0/
 
 	
 
q --mt{UYUb-cc%113G13LaaaQRQRQRTXj2Y[^__ 4#6#6t7L#M#MP]#]] !55mDD//--mt{UYUb-cc 4>>$2B#C#Cm#SSr2   rY   r1  )r)   r*   r+   r   r   r   rp   r   r   r-   r   r   r   r   r.   r   r   r   s   @r3   r@  r@    s5       @L @L} @L# @L @L @L @L @L @LD _%0A6RRR 266:7;7;+/8 8|8 !.8 &el3	8
 'u|48 'u|48 "%8 +,8 
	8 8 8 ^ SR8 8 8 8 8r2   r@  c                   b    e Zd ZU eed<   dZdZddgZdZdZ	dZ
dZe eedd	          d
Zd ZdS )IdeficsPreTrainedModelr   ra   Tr4  r@  Fr   r8  )index
layer_name)r&   r'   c                    | j         j        }t          |t          j        t          j        f          rJ|j        j                            d|           |j	         |j	        j        
                                 d S d S t          |t          j                  rU|j        j                            d|           |j        +|j        j        |j                 
                                 d S d S t          |t          j                  r?|j        j                            d           |j	        j        
                                 d S t          |t                    r!|j        j                            d           d S t          |t                     r |j        j                                         d S t          |t$                    r| j         j        dk    r>|j        j        
                                 |j        j        
                                 d S | j         j        dk    r@|j        j                            d           |j        j                            d           d S | j         j        dv rX|j        j                            d| j         j                   |j        j                            d| j         j                   d S d S t          |t.                    r |j        j                                         d S d S )Nr   )r   rH  r   rB  r   >   rE  rF  rG  )r   initializer_rangerZ   r   rQ   Conv2drr   datanormal_r   zero_rR   rm   rP   fill_r   r   class_embeddingr@  rO  rI  rJ  rQ  r   latents)rt   r\   rH  s      r3   _init_weightsz$IdeficsPreTrainedModel._init_weightsD  s    k+fry")455 	*M&&CS&999{& &&((((( '&-- 	*M&&CS&999!-"6#56<<>>>>> .--- 	*M$$S)))K""$$$$$// 	*M$$S))))) 788 	*"'//11111 ?@@ 	*{,77',22444"'--/////.&88',223777"'--c22222.2RRR',44#4;Cg4hhh"'//Sdk>b/ccccc SR  9:: 	*N'')))))	* 	*r2   N)r)   r*   r+   r   r/   base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_can_compile_fullgraph_supports_attention_backendr4  r   r   _can_record_outputsrc  r1   r2   r3   rW  rW  3  s         &*#.0QRN ""& -$n%5Q;WWW 
* * * * *r2   rW  c            !           e Zd ZdZdef fdZddZg fdZg fdZe	e
	 	 	 	 	 	 	 	 	 	 	 	 dd	eej                 d
eej                 deej                 dee         deej                 deej                 deej                 deej                 deej                 dee         dee         deej                 dee         deeef         fd                        Z xZS )IdeficsModelz
    Transformer decoder consisting of `config.num_hidden_layers` layers. Each layer is a [`IdeficsDecoderLayer`]

    Args:
        config: IdeficsConfig
    r   c                    t                                                     | _        j        | _        j        | _        t          j        j        j        j	        | j                  | _
        j        j        | _        j        | _        j        | j        _        t          j                  | _        j        r>j        }t%          j        j        |j        |j        |j        |j                  | _        t3          j        fdt7          j                  D                       | _        j        | _        j        | j        z  }t3          j        fdt7          |          D                       | _        d| _         tC          j        j"                  | _#        | $                                 | %                               d S )N)rj   rq   rk   rg   rm   c                 2    g | ]}t          |           S )r  )r4  rT   ir   s     r3   rW   z)IdeficsModel.__init__.<locals>.<listcomp>  s'    ___! 1555___r2   c                 2    g | ]}t          |           S rp  )r@  rq  s     r3   rW   z)IdeficsModel.__init__.<locals>.<listcomp>  s'    ccca,VqAAAcccr2   Fr  )&ro   rp   r   pad_token_idrm   
vocab_sizerf   additional_vocab_sizer   freeze_text_layersembed_tokensr  
image_sizer&  r    vision_modeluse_resamplerperceiver_configr   r  resampler_depthresampler_n_headsresampler_head_dimresampler_n_latentsperceiver_resamplerr   
ModuleListrangenum_hidden_layerslayerscross_layer_intervalgated_cross_attn_layersgradient_checkpointingr   r  norm	post_initfreeze_relevant_params)rt   r   r|  num_cross_layersrv   s    `  r3   rp   zIdeficsModel.__init__o  s      !. +5!,&,&B ,#6(
 
 
 !.9#1282M/4V5IJJ  		%6'@$. 0 2 3 4( (D$ m____uVE]?^?^___
 
 %+$?!!3t7PP')}cccc5QaKbKbccc(
 (
$ ',#"6#56;NOOO	 	##F+++++r2   Nc                     || j         }|j        r|                     |j                   |j        rt	          | j        |j                   d S d S N)rb   )r   rw  freeze_text_module_exceptionsfreeze_vision_layersrd   rz  freeze_vision_module_exceptions)rt   r   s     r3   r  z#IdeficsModel.freeze_relevant_params  sh    >[F$ 	J##F$HIII& 	f*f>deeeeee	f 	fr2   c                 J    | j         | j        fD ]}t          ||           d S r  )r  r  rd   )rt   rb   r\   s      r3   rw  zIdeficsModel.freeze_text_layers  s?    {DI. 	F 	FF3DEEEEE	F 	Fr2   c                 2    t          | j        |           d S r  )rd   rz  )rt   rb   s     r3   r  z!IdeficsModel.freeze_vision_layers  s    T&:KLLLLLLr2   FrH   r?   r   r%   inputs_embedsr:   r;   r<   r=   	use_cacheinterpolate_pos_encodingr   ru   rh   c           	      x	   ||j         n|j         }|du |duz  rt          d          ||                     |          }|
r|t          | j                  }|j        \  }}}||                                nd}||z   }|*t          j        |||j        d         z   |j                   }|V|T|	                                
                    d          dz
  }|                    |dk    d           |dd| df         }n||                    d          }t          d |||fD                       d	k    rt          d
          |{|                    | j        |          }|j        dd	         \  }} |                                j        ||z  g|j        d	d         R  }|                     ||          j        }nQ|O|                                \  }}}}|                    | j        |          }|                    ||z  ||          }| j        j        r^|@|                     |          }|                    d          |                    d	          }}n|                                \  }}}}|}n<|+|                    d          |                    d	          }}nt          d          |                    |||z  |          }|	                    d          }|	                    d          }	|	                    ddd|          }	|	                    ||||z            }	|J|                                \  }}}||f}|	t          j        ||          }	|                     |	          }	nd}	|	dk                        d                              | j                                      d                              |          }|(t          j        ||ft          j        |j                   }t;          | j        |||||          }|}t=          | j                  D ]E\  }} || j         z  dk    r$| j!        || j         z           }! |!|||f|	|dd|} | |f||||d|}F| "                    |          }|                    ||||          }tG          |||          S )ab  
        image_encoder_embeddings (`torch.FloatTensor`, *optional*):
            The output of the image encoder.
        perceiver_embeddings (`torch.FloatTensor`, *optional*):
            The output of the perceiver resampler.
        image_attention_mask (`torch.LongTensor`, *optional*):
            The attention mask for the image encoder.
        Nz:You must specify exactly one of input_ids or inputs_embeds)r   r   r   )rE   r9   c              3      K   | ]}|d u V  	d S rY   r1   )rT   r   s     r3   r]   z'IdeficsModel.forward.<locals>.<genexpr>  s&      aaQqDyaaaaaar2   r   z_Exactly 1 of pixel_values, image_encoder_embeddings or perceiver_embeddings has to be not-None.)rl   rE   )r:   r  zBIf `perceiver_embeddings` are passed, use_resampler should be Truer   r   r   )r   input_embedsr?   r   r%   r   )r=   rS  r%   )r?   r   r%   r   )r$   r(   r%   )$rE   rn   rx  r   r   rA   get_seq_lengthr-   r@   longcumsummasked_fill_r   sumrD   rl   r   rB   rz  r$   r#  r{  r  rC   r   invert_attention_maskr_   squeezer   r   	enumerater  r  r  r  r#   )"rt   rH   r?   r   r%   r  r:   r;   r<   r=   r  r  r   ru   rE   
batch_size
seq_lengthr)  past_key_values_lengthseq_length_with_past
num_imagesr(   image_seq_lenimage_hidden_sizetext_seq_lenimage_batch_sizeimage_sequence_lengthimage_hidden_shaperS  causal_maskr&   idxdecoder_layercross_attn_blocks"                                     r3   r   zIdeficsModel.forward  s   4 &/%:!!@T-t";< 	[YZZZ  --i88M 	?0*$+>>>O$1$7!
JETE`!?!?!A!A!Afg),BB!"\&(>ATUVAW(W`m`t  N %,*>)..0077;;a?L%%n&91==='J;<<8LL!)33A66Laa<1IK_"`aaaaaefffq   %'??F?KKL%1%7%;"J
9<22449*z:QkT`TfghgigiTjkkkL #'"3"3)D\ #4 # #   &1G_GdGdGfGfDJ
M3D":"="=DJW]"="^"^"5":"::
;RTact"u"u;$ 
	c#+'+'?'?@S'T'T$3G3L3LQ3O3OQeQjQjklQmQm0K_KdKdKfKfH
J7H"6!)/B/G/G/J/JL_LdLdefLgLg,MMabbb166z:P]C]_pqq ,00333==bAA3::1aMRR388\S]`mSmnn*9L9Q9Q9S9S63Q"24I!J#+',z2DV'T'T'T$#'#=#=>R#S#S  #'  $83#>"C"C"C"K"K!O!OVZV`!O!a!a j jop j q quu 
  

 !"Z12%*]Ma  N );&))+%
 
 
 &"+DK"8"8 	 	CT..!33#'#?tG`@`#a  0 0!'! *>)=$(! ! ! ! *M*) /-   MM 		-00166z:}^opp-+ 3+
 
 
 	
r2   rY   )NNNNNNNNNNFN)r)   r*   r+   r,   r   rp   r  rw  r  r   r   r   r-   r2  r   r   r.   r   r   r   r   r0   r#   r   r   r   s   @r3   rm  rm  f  s        0,} 0, 0, 0, 0, 0, 0,df f f f 46 F F F F 68 M M M M  151537+/5948@D<@7;$(3859^
 ^
E,-^
 !.^
 u/0	^

 "%^
   12^
 u01^
 #+5+<"=^
 'u'89^
 'u|4^
 D>^
 #+4.^
 !!12^
 +,^
 
u44	5^
 ^
 ^
 ^ ^
 ^
 ^
 ^
 ^
r2   rm  c            #       (    e Zd ZddgZd fd	Zd Zee	 	 	 	 	 	 	 	 	 	 	 	 	 ddee	j
                 dee	j                 d	ee	j
                 d
ee         dee	j                 dee	j                 dee	j                 dee	j                 dee	j                 dee	j
                 dee         dee         dee	j
                 dee         deeef         fd                        Z	 	 	 	 	 	 	 	 	 d fd	Z	 ddedeeef         dedeeef         f fdZ xZS ) IdeficsForVisionText2Textzmodel.embed_tokens.weightzlm_head.weightNc                     t                                          |           t          |          | _        t	          |j        |j        |j        d|j                  | _	        | 
                                 d S )NFr   )ro   rp   rm  ra   r   r   ru  rv  freeze_lm_headlm_headr  )rt   r   rz  rv   s      r3   rp   z"IdeficsForVisionText2Text.__init__X  ss       !&))
-**$*$@#2
 
 
 	r2   c                    |                                  }|                                 }t          | j        dd          r?|j        |_        |j        dk    r(|j        |j        k    sJ |j        j        |j        _        t          |d          rJt          |d          r<|j
        |_        t          |d          r"t          |d          r|j        |_        dS dS dS dS dS )	z
        Overwrite `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of
        IdeficsDecoupledLinear and IdeficsDecoupledEmbedding.
        tie_word_embeddingsTr   r   rj   r   rq   N)get_output_embeddingsget_input_embeddingsgetattrr   rr   rq   r   rs   r   r  rj   r   )rt   output_embeddingsinput_embeddingss      r3   tie_weightsz%IdeficsForVisionText2Text.tie_weightsg  s,   
 !668844664; 5t<< 	f'7'>$9A==(@DTDnnnnn9I9^9e!/6$n55 	g'BRTd:e:e 	g-=-L*(*CDD g "=J J g =M<f!999	g 	g 	g 	gg g g gr2   FrH   r?   r   r%   r  r:   r;   r<   r=   labelsr  r  r   ru   rh   c                     | j         d|||||||||	||d|d|}|d         }|                     |          }d}|
 | j        d||
| j        j        d|}t          |||j        |j        |j        |j	                  S )aC  
        image_encoder_embeddings (`torch.FloatTensor`, *optional*):
            The output of the image encoder.
        perceiver_embeddings (`torch.FloatTensor`, *optional*):
            The output of the perceiver resampler.
        image_attention_mask (`torch.LongTensor`, *optional*):
            The attention mask for the image encoder.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoProcessor, IdeficsForVisionText2Text

        >>> model = IdeficsForVisionText2Text.from_pretrained("HuggingFaceM4/idefics-9b")
        >>> processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics-9b")

        >>> dogs_image_url_1 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image1.jpeg"
        >>> dogs_image_url_2 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image2.jpeg"

        >>> prompts = [
        ...     [
        ...         "User:",
        ...         dogs_image_url_1,
        ...         "Describe this image.\nAssistant: An image of two dogs.\n",
        ...         "User:",
        ...         dogs_image_url_2,
        ...         "Describe this image.\nAssistant:",
        ...     ]
        ... ]
        >>> inputs = processor(prompts, return_tensors="pt")
        >>> generate_ids = model.generate(**inputs, max_new_tokens=6)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True)
        ```T)rH   r?   r   r%   r  r:   r;   r<   r=   r  r  return_dictr   r   N)r7   r  ru  )r6   r7   r%   r&   r'   r(   r1   )
ra   r  loss_functionr   ru  r5   r%   r&   r'   r(   )rt   rH   r?   r   r%   r  r:   r;   r<   r=   r  r  r  r   ru   outputsr&   r7   r6   s                      r3   r   z!IdeficsForVisionText2Text.forward|  s    r $* 
)%+'%%=!5!5%=)
 
 
 
"  
m,,%4%pVFt{OeppioppD,#3!/) ' ;
 
 
 	
r2   c                    i }|| j         j        r||d<   n||d<   n||d<   |                    dd          |d<    t                      j        |f||||||
|	d||}|	'|%|d         j        d         }|	d d | d f         |d	<   |S )
Nr<   r;   r:   r  F)r%   r?   r  r   r   r  r=   rH   r   r=   )r   r{  popro   prepare_inputs_for_generationrA   )rt   rH   r?   r   r  r%   r   r:   r(   r=   r  ru   images_kwargsmodel_inputsr  rv   s                  r3   r  z7IdeficsForVisionText2Text.prepare_inputs_for_generation  s      *{( P8K455<O899,8M.)4:JJ?Y[`4a4a01<uww<
+)')%!5
 
 
 
 
  +0E%k28;J3GJ;<<3XL/0r2   r  rL   rJ   c                     t                      j        |||fi |}d|v ra|d         }|d d dd d f                             d          }|                    dd          r||d<   nt	          j        ||gd          |d<   |j        |d<   |S )Nr=   r9   r   r  Tr   r(   )ro   #_update_model_kwargs_for_generationr   rG   r-   r   r(   )rt   r  rL   rJ   ru   r=   	last_maskrv   s          r3   r  z=IdeficsForVisionText2Text._update_model_kwargs_for_generation  s     CuwwB
 
 	
 
 "\11#/0F#G ,QQQAAAX6@@CCIT22 k7@3447<yBVXaAbhi7j7j7j34 /6.I*+r2   rY   )NNNNNNNNNNNFN)	NNNNNNNNN)F)r)   r*   r+   _tied_weights_keysrp   r  r   r   r   r-   r2  r   r   r.   r   r   r   r   r0   r5   r   r  r   dictr   r   r  r   r   s   @r3   r  r  U  sb       57GH     g g g*  151537+/5948@D<@7;-1$(3859V
 V
E,-V
 !.V
 u/0	V

 "%V
   12V
 u01V
 #+5+<"=V
 'u'89V
 'u|4V
 )*V
 D>V
 #+4.V
 !!12V
 +,V
  
u33	4!V
 V
 V
 ^ V
v  !+ + + + + +b $)	  38n !	 
c3h         r2   r  )r  rm  rW  )r   FNN)r   )r   )Lr,   dataclassesr   typingr   r   r   r   r-   torch.nn.functionalr   r   rx   activationsr
   cache_utilsr   r   
generationr   masking_utilsr   modeling_layersr   modeling_outputsr   modeling_utilsr   r   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   utils.genericr   r   configuration_ideficsr   	perceiverr   visionr   r    
get_loggerr)   r	  r#   r5   rN   rd   rR   rf   rQ   r   Moduler   r   r   r   r   r   r   r   r   r4  r@  rW  rm  r  __all__r1   r2   r3   <module>r     s  (   ! ! ! ! ! ! 1 1 1 1 1 1 1 1 1 1 1 1                 ! ! ! ! ! ! . . . . . . . . ) ) ) ) ) ) / / / / / / 9 9 9 9 9 9 + + + + + + X X X X X X X X X X & & & & & & R R R R R R R R R R R R 0 0 0 0 0 0 ? ? ? ? ? ? ? ? 0 0 0 0 0 0 0 0 0 0 0 0 E E E E E E E E 
	H	%	%   
C C C C C[ C C  C6   
C C C C CK C C  C8 *# *# *# *#Z +-    fA fA fA fA fA fA fA fAR8S 8S 8S 8S 8SRY 8S 8S 8SxJ J J J JRY J J J0$
 $
 $
 $
 $
ux $
 $
 $
N( ( (   :P P P P P P P P2 % %I%<% 
% <	%
 U\*% % % % % %0W) W) W) W) W)ry W) W) W)v6 6 6 6 64 6 6 6r} } } } }&@ } } }@ /* /* /* /* /*_ /* /* /*d k
 k
 k
 k
 k
) k
 k
 k
\F F F F F 6 F F FR R
Q
Qr2   