
     `i@                       d Z ddlZddlmZ ddlmZmZmZmZ ddl	Z	ddl	m
Z
 ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z-  e&j.        e/          Z0dTde	j1        de	j2        dee3         fdZ4	 dUde	j5        de	j2        de	j6        de3fdZ7dUdZ8e e$d           G d de"                                  Z9e e$d            G d! d"e"                                  Z: G d# d$e
j;                  Z<	 dVd&e
j;        d'e	j1        d(e	j1        d)e	j1        d*ee	j1                 d+e=d,e=fd-Z> G d. d/e
j;                  Z? G d0 d1e
j;                  Z@ G d2 d3e          ZA G d4 d5e
j;                  ZB G d6 d7e
j;                  ZC G d8 d9e
j;                  ZD G d: d;e
j;                  ZE G d< d=e
j;                  ZF G d> d?e          ZG G d@ dAe
j;                  ZHe$ G dB dCe                      ZI G dD dEeI          ZJ G dF dGeI          ZK e$dH           G dI dJeIe                      ZL G dK dLe
j;                  ZM e$dM           G dN dOeI                      ZN e$dP           G dQ dReIe                      ZOg dSZPdS )WzPyTorch KOSMOS-2 model.    N)	dataclass)AnyCallableOptionalUnion)nn   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsBaseModelOutputWithPooling!CausalLMOutputWithCrossAttentions)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int)deprecate_kwarg   )Kosmos2ConfigKosmos2TextConfigKosmos2VisionConfigmaskdtypetgt_lenc                 L   |                                  \  }}||n|}| ddddddf                             |d||                              |          }d|z
  }|                    |                    t          j                  t	          j        |          j                  S )z_
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    Nr         ?)sizeexpandtomasked_filltorchboolfinfomin)r#   r$   r%   bszsrc_lenexpanded_maskinverted_masks          /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/kosmos2/modeling_kosmos2.py_expand_maskr5   -   s     99;;LC ,gg'GD$)*11#q'7KKNNuUUM-'M$$]%5%5ej%A%A5;uCUCUCYZZZ    input_ids_shapedevicepast_key_values_lengthc                 *   | \  }}t          j        ||ft          j        |          j        |          }t          j        |                    d          |          }|                    ||dz                       |                    d          d          k     d           |                    |          }|dk    r.t          j	        t          j
        ||||          |gd          }|ddddddf                             |d|||z             S )zB
    Make causal mask used for bi-directional self-attention.
    )r8   r   r   r$   r8   dimN)r,   fullr.   r/   aranger(   masked_fill_viewr*   catzerosr)   )r7   r$   r8   r9   r0   r%   r#   	mask_conds           r4   _make_causal_maskrF   ;   s    #LC:w(%+e*<*<*@PPPDTYYr]]6:::Ii9q="6"6tyy}}a"H"HH!LLL775>>D!!y%+g/EU[abbbdhioqrrrdAAAqqq !((a'DZ:Z[[[r6   c                     |                      |                                          }t          j        |d                              |          |z   |z  }|                                |z   S )a  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    r   r=   )neintr,   cumsumtype_aslong)	input_idspadding_idxr9   r#   incremental_indicess        r4   "create_position_ids_from_input_idsrP   M   sg     <<$$((**D <!444<<TBBE[[_cc##%%33r6   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    )custom_introc                   ,   e Zd ZU dZdZeej                 ed<   dZ	ee
         ed<   dZeeej                          ed<   dZeeej                          ed<   dZeej                 ed<   dZeeej                          ed<   dZeed	<   d
ee         fdZdS )Kosmos2ModelOutputa  
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
    projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
        the weighted average in the self-attention heads.
    vision_model_output (`BaseModelOutputWithPooling`, *optional*):
        The output of the [`Kosmos2VisionModel`].
    Nlast_hidden_statepast_key_valueshidden_states
attentionsimage_embedsprojection_attentionsvision_model_outputreturnc                 ^     t           fd                                 D                       S )Nc              3   t   K   | ]2}|d vr|         n!t          |                                          V  3dS )text_model_outputrZ   Ngetattrto_tuple.0kselfs     r4   	<genexpr>z.Kosmos2ModelOutput.to_tuple.<locals>.<genexpr>   c       
 
  LLLDGGRYZ^`aRbRbRkRkRmRm
 
 
 
 
 
r6   tuplekeysrf   s   `r4   rb   zKosmos2ModelOutput.to_tuple   C     
 
 
 
YY[[
 
 
 
 
 	
r6   )__name__
__module____qualname____doc__rT   r   r,   FloatTensor__annotations__rU   r   rV   rj   rW   rX   rY   rZ   r   r   rb    r6   r4   rS   rS   ]   s          & 6:x 12999'+OXe_+++8<M8E%"345<<<59Ju01299904L(5,-444@D8E%*;$<=DDD6:3:::
%* 
 
 
 
 
 
r6   rS   zC
    Model output class for `Kosmos2ForConditionalGeneration`.
    c                   P   e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
ee         ed<   dZeeej                          ed<   dZeeej                          ed<   dZeej                 ed<   dZeeej                          ed	<   dZeed
<   dee         fdZdS )*Kosmos2ForConditionalGenerationModelOutputa*  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
    projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
        the weighted average in the self-attention heads.
    vision_model_output (`BaseModelOutputWithPooling`, *optional*):
        The output of the [`Kosmos2VisionModel`].
    NlosslogitsrU   rV   rW   rX   rY   rZ   r[   c                 ^     t           fd                                 D                       S )Nc              3   t   K   | ]2}|d vr|         n!t          |                                          V  3dS r^   r`   rc   s     r4   rg   zFKosmos2ForConditionalGenerationModelOutput.to_tuple.<locals>.<genexpr>   rh   r6   ri   rl   s   `r4   rb   z3Kosmos2ForConditionalGenerationModelOutput.to_tuple   rm   r6   )rn   ro   rp   rq   rw   r   r,   rr   rs   rx   rU   r   rV   rj   rW   rX   rY   rZ   r   r   rb   rt   r6   r4   rv   rv      s         . )-D(5$
%,,,*.FHU&'...'+OXe_+++8<M8E%"345<<<59Ju01299904L(5,-444@D8E%*;$<=DDD6:3:::
%* 
 
 
 
 
 
r6   rv   c                   v     e Zd Zdef fdZdej        dededej        fdZdd	ej	        dej        fd
Z
 xZS )Kosmos2VisionEmbeddingsconfigc                 z   t                                                       || _        |j        | _        |j        | _        |j        | _        t          j        t          j
        | j                            | _        t          j        |j        | j        | j        | j        d          | _        | j        | j        z  dz  | _        | j        dz   | _        t          j        | j        | j                  | _        |                     dt          j        | j                                      d          d           d S )NF)in_channelsout_channelskernel_sizestridebias   r   position_ids)r   r;   
persistent)super__init__r}   hidden_size	embed_dim
image_size
patch_sizer   	Parameterr,   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr@   r)   rf   r}   	__class__s     r4   r   z Kosmos2VisionEmbeddings.__init__   s   + + +!|EK,G,GHH!y+? 
  
  
 !Ot>1D!-1"$,t/A4>"R"R^U\$:L-M-M-T-TU\-]-]jopppppr6   
embeddingsheightwidthr[   c                    |j         d         dz
  }| j        j                            d          }|j         d         dz
  }t          j                                        s&||k    r ||k    r|                     | j                  S |ddddf         }|ddddf         }|j         d         }	|| j        z  }
|| j        z  }t          |dz            }|
                    d|||	          }|                    dddd          }t          j                            ||
|fdd	
          }|                    dddd                              dd|	          }t	          j        ||fd          S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nr;   g      ?r	   r   bicubicF)r(   modealign_cornersr=   )shaper   weight	unsqueezer,   jit
is_tracingr   r   r   reshapepermuter   
functionalinterpolaterB   rC   )rf   r   r   r   r   r   r   class_pos_embedpatch_pos_embedr>   
new_height	new_widthsqrt_num_positionss                r4   interpolate_pos_encodingz0Kosmos2VisionEmbeddings.interpolate_pos_encoding   s    !&q)A-!4;EEaHH*03a7 y##%% 	>+*F*F6UZ??**4+<===,QQQU3,QQQU3r"t.
T_,	&}c'9::)11!5GI[]`aa)11!Q1==-33i(	 4 
 
 *11!Q1==BB1b#NNy/?;CCCCr6   Fpixel_valuesc                 <   |j         \  }}}}|s<|| j        k    s|| j        k    r&t          d| d| d| j         d| j         d	          | j        j        j        }|                     |                    |                    }|                    d                              dd          }| j	        
                    |dd          }	t          j        |	|gd	          }
|r|
|                     |
||          z   }
n|
|                     | j                  z   }
|
S )
NzInput image size (*z) doesn't match model ().r$   r   r   r;   r=   )r   r   
ValueErrorr   r   r$   r*   flatten	transposer   r)   r,   rC   r   r   r   )rf   r   r   
batch_size_r   r   target_dtypepatch_embedsclass_embedsr   s              r4   forwardzKosmos2VisionEmbeddings.forward   sD   '3'9$
Avu' 	Vt-F-F%SWSbJbJbqVqqeqqDOqq^b^mqqq   +28++LOO,O,O,OPP#++A..88A>>+22:q"EEYl;CCC
# 	Q#d&C&CJPVX]&^&^^JJ#d&=&=d>O&P&PPJr6   F)rn   ro   rp   r"   r   r,   TensorrI   r   rr   r   __classcell__r   s   @r4   r|   r|      s        q2 q q q q q q,'D5< 'D 'DUX 'D]b]i 'D 'D 'D 'DR E$5 Z_Zf        r6   r|           modulequerykeyvalueattention_maskscalingdropoutc                 z   t          j        ||                    dd                    |z  }|||z   }t          j                            |d          }t          j                            ||| j                  }t          j        ||          }	|	                    dd                                          }	|	|fS )Nr;   r=   ptrainingr   r   )	r,   matmulr   r   r   softmaxr   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r4   eager_attention_forwardr   	  s     <s}}R'<'<==GL!#n4=((2(>>L=((6?([[L,|U33K''1--88::K$$r6   c                        e Zd ZdZ fdZ	 	 	 ddej        deej                 deej                 dee         d	e	ej        eej                 f         f
d
Z
 xZS )Kosmos2VisionAttention=Multi-headed attention from 'Attention Is All You Need' paperc                    t                                                       || _        |j        | _        |j        | _        | j        | j        z  | _        | j        | j        z  | j        k    r t          d| j         d| j         d          | j        dz  | _	        |j
        | _        d| _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        d S )N;embed_dim must be divisible by num_heads (got `embed_dim`:  and `num_heads`: r         F)r   r   r}   r   r   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr   	is_causalr   Lineark_projv_projq_projout_projr   s     r4   r   zKosmos2VisionAttention.__init__"  s   +3$.8=4>)T^;;'dn ' 'N' ' '   ]D(
/i??i??i??	$.$.AAr6   NFrV   r   causal_attention_maskoutput_attentionsr[   c           
      n   |j         \  }}}|                     |          }|                     |          }	|                     |          }
|                    ||| j        | j                                      dd          }|	                    ||| j        | j                                      dd          }	|
                    ||| j        | j                                      dd          }
| j        j	        dk    r||||z   }n||}n	|du| _
        t          }| j        j	        dk    rt          | j        j	                 } || ||	|
|| j
        | j        | j        sdn| j                  \  }}|                    |||                                          }|                     |          }|sd}||fS )#Input shape: Batch x Time x Channelr   r   flash_attention_2Neagerr   )r   r   r   )r   r   r   r   rB   r   r   r   r}   _attn_implementationr   r   r   r   r   r   r   r   r   )rf   rV   r   r   r   r   
seq_lengthr   queriesrk   valuesattention_interfacer   r   s                 r4   r   zKosmos2VisionAttention.forward6  s    -:,?)
J	++m,,{{=))]++,,z:t~t}UU__`acdeeyyZOOYYZ[]^__ZT^T]SS]]^_abcc ;+/BBB).C.O!/2G!G&2!62$>DN(?;+w66"9$+:Z"[$7$7nJ#}>CC$,	%
 	%
 	%
!\ "))*j)LLWWYYmmK00  	 LL((r6   )NNF)rn   ro   rp   rq   r   r,   r   r   r-   rj   r   r   r   s   @r4   r   r     s        GGB B B B B. 268<,1/) /)|/) !./)  (5	/)
 $D>/) 
u|Xel33	4/) /) /) /) /) /) /) /)r6   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )Kosmos2VisionMLPc                    t                                                       || _        t          |j                 | _        t          j        |j        |j	                  | _
        t          j        |j	        |j                  | _        d S N)r   r   r}   r
   
hidden_actactivation_fnr   r   r   intermediate_sizefc1fc2r   s     r4   r   zKosmos2VisionMLP.__init__j  sf    #F$569V/1IJJ9V5v7IJJr6   rV   r[   c                     |                      |          }|                     |          }|                     |          }|S r   )r   r   r   rf   rV   s     r4   r   zKosmos2VisionMLP.forwardq  s=    //**=99//r6   )rn   ro   rp   r   r,   r   r   r   r   s   @r4   r   r   i  sc        K K K K KU\ el        r6   r   c                        e Zd Zdef fdZ	 d
dej        dej        dej        dee         de	ej
                 f
d	Z xZS )Kosmos2VisionEncoderLayerr}   c                 D   t                                                       |j        | _        t	          |          | _        t          j        | j        |j                  | _	        t          |          | _        t          j        | j        |j                  | _        d S Neps)r   r   r   r   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2r   s     r4   r   z"Kosmos2VisionEncoderLayer.__init__z  s    +/77<F<QRRR#F++<F<QRRRr6   FrV   r   r   r   r[   c                     |}|                      |          }|                     ||||          \  }}||z   }|}|                     |          }|                     |          }||z   }|f}|r||fz  }|S )aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rV   r   r   r   )r  r  r
  r	  )rf   rV   r   r   r   residualr   outputss           r4   r   z!Kosmos2VisionEncoderLayer.forward  s    " !((77&*nn')"7/	 '5 '
 '
#| !=0 ((77// =0 " 	'&Gr6   r   )rn   ro   rp   r"   r   r,   r   r   r-   rj   rr   r   r   r   s   @r4   r   r   y  s        S2 S S S S S S -2& &|& &  %|	&
 $D>& 
u 	!& & & & & & & &r6   r   c                        e Zd ZdZdef fdZe	 	 	 	 	 ddeej	                 deej	                 dee
         dee
         d	ee
         d
eeef         fd            Z xZS )Kosmos2VisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Kosmos2VisionEncoderLayer`].

    Args:
        config: Kosmos2VisionConfig
    r}   c                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S rt   )r   )rd   r   r}   s     r4   
<listcomp>z1Kosmos2VisionEncoder.__init__.<locals>.<listcomp>  s"    $p$p$p1%>v%F%F$p$p$pr6   F)	r   r   r}   r   
ModuleListrangenum_hidden_layerslayersgradient_checkpointingr   s    `r4   r   zKosmos2VisionEncoder.__init__  sb    m$p$p$p$pPUV\VnPoPo$p$p$pqq&+###r6   Nr   r   r   output_hidden_statesreturn_dictr[   c                 @   ||n| j         j        }||n| j         j        }||n| j         j        }|rdnd}|rdnd}|}	t	          | j                  D ]2\  }
}|r||	fz   } ||	|||          }|d         }	|r||d         fz   }3|r||	fz   }t          |	||          S )a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nrt   )r   r   r   )rT   rV   rW   )r}   r   r  use_return_dict	enumerater  r   )rf   inputs_embedsr   r   r   r  r  encoder_statesall_attentionsrV   idxencoder_layerlayer_outputss                r4   r   zKosmos2VisionEncoder.forward  s    N 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]3=0:d%"+DK"8"8 	F 	FC# C!/=2B!B)M%"3	  M *!,M  F!/=3C2E!E 	?+}.>>N+>Vd
 
 
 	
r6   )NNNNN)rn   ro   rp   rq   r"   r   r   r   r,   r   r-   r   rj   r   r   r   r   s   @r4   r  r    s         ,2 , , , , , ,  268<,0/3&*D
 D
 !.D
  (5	D

 $D>D
 'tnD
 d^D
 
uo%	&D
 D
 D
 D
 D
 D
 D
 D
r6   r  c                        e Zd Zdef fdZ	 	 	 	 	 ddeej                 dee         dee         ded	ee         d
e	e
ef         fdZ xZS )Kosmos2VisionTransformerr}   c                 4   t                                                       || _        |j        }t	          |          | _        t          j        ||j                  | _	        t          |          | _        t          j        ||j                  | _        d S r  )r   r   r}   r   r|   r   r   r  r  pre_layrnormr  encoderpost_layernorm)rf   r}   r   r   s      r4   r   z!Kosmos2VisionTransformer.__init__  s    &	1&99L8MNNN+F33 l9&:OPPPr6   NFr   r   r  r   r  r[   c                    ||n| j         j        }||n| j         j        }||n| j         j        }|t	          d          |                     ||          }|                     |          }|                     ||||          }|d         }|d d dd d f         }	|                     |	          }	|s||	f|dd          z   S t          ||	|j
        |j                  S )Nz You have to specify pixel_values)r   )r  r   r  r  r   r   )rT   pooler_outputrV   rW   )r}   r   r  r  r   r   r&  r'  r(  r   rV   rW   )
rf   r   r   r  r   r  rV   encoder_outputsrT   pooled_outputs
             r4   r   z Kosmos2VisionTransformer.forward  s/    2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]?@@@Oghh))-88,,'/!5#	 ' 
 
 ,A.)!!!Q'2++M:: 	L%}58KKK)/')7&1	
 
 
 	
r6   NNNFN)rn   ro   rp   r"   r   r   r,   rr   r-   r   rj   r   r   r   r   s   @r4   r$  r$    s        Q2 Q Q Q Q Q Q 59,0/3).&*'
 '
u01'
 $D>'
 'tn	'

 #''
 d^'
 
u00	1'
 '
 '
 '
 '
 '
 '
 '
r6   r$  c                   &    e Zd ZdZddededee         f fdZddededee         fdZeddededee         fd	            Z	 e
j                    	 	 	 	 ddee
j                 dee
j                 dedee
j                 fd            Zd Z xZS )(Kosmos2TextSinusoidalPositionalEmbeddingzDThis module produces sinusoidal positional embeddings of any length.Nr   embedding_dimrN   c                     t                                                       d| _        || _        || _        |                     || j        z   ||           d S )Nr   )r   r   offsetr0  rN   make_weights)rf   r   r0  rN   r   s       r4   r   z1Kosmos2TextSinusoidalPositionalEmbedding.__init__?  sU    *&-$+5}kRRRRRr6   num_embeddingsc                     |                      |||          }t          | d          r+|                    | j        j        | j        j                  }|                     d|d           d S )Nweightsr<   Fr   )get_embeddinghasattrr*   r6  r$   r8   r   )rf   r4  r0  rN   emb_weightss        r4   r3  z5Kosmos2TextSinusoidalPositionalEmbedding.make_weightsG  sl    ((TT4## 	_%..t|/A$,J].^^KYFFFFFr6   c                    |dz  }t          j        d          |dz
  z  }t          j        t          j        |t          j                                                  | z            }t          j        | t          j                                                                      d          |                    d          z  }t          j        t          j	        |          t          j
        |          gd                              | d          }|dz  dk    r+t          j        |t          j        | d          gd          }|	d||ddf<   |                    t          j                              S )	z
        Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
        "Attention Is All You Need".
        r   i'  r   r   r   r=   r;   N)mathlogr,   expr@   int64floatr   rC   sincosrB   rD   r*   get_default_dtype)r4  r0  rN   half_dimembs        r4   r7  z6Kosmos2TextSinusoidalPositionalEmbedding.get_embeddingO  s?    !A%huooA.iXU[AAAGGIISDPQQl>===CCEEOOPQRRUXUbUbcdUeUeei338a@@@EEnVXYY1!!)S%+na"@"@AqIIIC""#CQQQvve-//000r6   r   rM   r  r9   r   c                 |   |H|                                 \  }}|.t          || j        |                              |j                  }n7|                                 d d         \  }}||                     ||          }| j        dz   |z   |z   }|| j                             d          k    r)|                     || j        z   | j	        | j                   | j        
                    d|                    d                                        ||| j        j        d                                                   S )Nr;   r   r   )r(   rP   rN   r*   r8   &create_position_ids_from_inputs_embedsr6  r3  r2  r0  index_selectrB   r   detach)rf   rM   r  r9   r   r0   seq_lenmax_poss           r4   r   z0Kosmos2TextSinusoidalPositionalEmbedding.forwarde  s8     $>>++LC#At/1G   "Y%&&  )--//4LC##JJ=Zpqq "Q&03IIT\&&q))))g3T5GIYZZZ|((L,=,=b,A,ABBGGWVZVbVhikVlmmttvvvr6   c                 8   |                                 dd         }|d         }t          j        | j        dz   || j        z   dz   t          j        |j                  }|                    d                              |                                          |z   S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr;   r   r<   r   )	r(   r,   r@   rN   rL   r8   r   r)   r   )rf   r  r9   input_shapesequence_lengthr   s         r4   rF  zOKosmos2TextSinusoidalPositionalEmbedding.create_position_ids_from_inputs_embeds  s     $((**3B3/%a.|q /D4D"Dq"HPUPZcpcw
 
 
 %%a((//<<GGIILbbbr6   r   )NNr   N)rn   ro   rp   rq   rI   r   r   r3  staticmethodr7  r,   no_gradr   r   rF  r   r   s   @r4   r/  r/  ;  s}       NNS Sc S# SHUXM S S S S S SG G3 Gs GQYZ]Q^ G G G G 1 1c 1# 1HUXM 1 1 1 \1( U]__ -104&'/3w wEL)w  -w !$	w
 u|,w w w _w6c c c c c c cr6   r/  c                       e Zd ZdZ	 	 	 	 	 ddededed	ee         d
ee         dee         dee         f fdZ e	ddd          	 	 	 	 	 	 dde
j        dee
j                 dee         dee
j                 dee
j                 dedee
j                 dee
j        ee
j                 ee         f         fd            Z xZS )KosmosTextAttentionr   r   FTNr   r   r   
is_decoderadd_inner_attn_layernormr   	layer_idxc	                 `   t                                                       || _        || _        || _        || _        ||z  | _        | j        |z  | j        k    rt          d| j         d| d          | j        dz  | _        || _	        || _
        t          j        |||          | _        t          j        |||          | _        t          j        |||          | _        t          j        |||          | _        d | _        |r"t          j        ||j                  | _        d S d S )Nr   r   r   r   )r   r  )r   r   r}   r   r   r   r   r   r   rR  rT  r   r   r   r   r   r   inner_attn_lnr  r  )
rf   r}   r   r   r   rR  rS  r   rT  r   s
            r4   r   zKosmosTextAttention.__init__  sK    	""!Y.MI%$.883dn 3 3%.3 3 3   }d*$"i	94@@@i	94@@@i	94@@@	)YTBBB "# 	T!#iV=R!S!S!SD	T 	Tr6   past_key_valuerU   4.58new_nameversionrV   encoder_hidden_statesr   layer_head_maskr   cache_positionr[   c                    |du}	|j         dd         \  }
}|                     |          }|                    |
|| j        | j                                      dd          }d}|Ht          |t                    r1|j        	                    | j
                  }|	r|j        }n
|j        }n|}|	r|n|}|	r3|1|r/|j        | j
                 j        }|j        | j
                 j        }n|                     |          }|                     |          }|                    |
d| j        | j                                      dd          }|                    |
d| j        | j                                      dd          }|N|	s|nd}|                    ||| j
        d|i          \  }}|	r$t          |t                    rd|j        | j
        <   t&          }| j        j        dk    rt,          | j        j                 } || ||||f| j        sd	n| j        | j        d
|\  }}|                    |
|d                                          }| j        |                     |          }|                     |          }||fS )r   Nr   r   Fr;   r^  Tr   r   )r   r   )r   r   rB   r   r   r   
isinstancer   
is_updatedgetrT  cross_attention_cacheself_attention_cacher  rk   r   r   r   updater   r}   r   r   r   r   r   r   r   rV  r   )rf   rV   r\  rU   r   r]  r   r^  r   is_cross_attentionr   r   query_statesra  curr_past_key_valuecurrent_states
key_statesvalue_statesr   r   r   s                        r4   r   zKosmosTextAttention.forward  s     3$>!.!4RaR!8
J{{=11#((ZQUQ^__iijkmnoo
&/+>?? 6,7;;DNKK
% O*9*O''*9*N''&5#2DW..- 	F/"=*"=,3DNCHJ.5dnELLL^44J;;~66L#RWWaabcefggJ',,ZT^T][[eefgijkkL*7I!St+>+E+Edn?OQ_>`, ,(
L & F*_FY*Z*Z FAEO.t~>(?;+w66"9$+:Z"[$7$7	%
  $}>CC$,L	%
 	%
 	%
 	%
!\ "))*j"EEPPRR),,[99KmmK00L((r6   )r   FFTN)NNNNFN)rn   ro   rp   rq   rI   r?  r   r-   r   r   r,   r   r   rj   r   r   r   s   @r4   rQ  rQ    s       GG %*38#$(#T #T #T 	#T
 #T TN#T #+4.#T tn#T D>#T #T #T #T #T #TJ _%0A6RRR 9=+/1526"'15L) L)|L)  (5L) "%	L)
 !.L) "%,/L)  L) !.L) 
u|Xel3Xe_D	EL) L) L) SRL) L) L) L) L)r6   rQ  c                   *     e Zd Zdef fdZd Z xZS )Kosmos2TextFFNr}   c                    t                                                       |j        | _        t          |j                 | _        |j        | _        t          j        |j	        |j
                  | _        t          j        |j
        |j	                  | _        t          j        |j
        |j                  | _        d S r  )r   r   r   r
   activation_functionr   activation_dropoutr   r   r   ffn_dimr   r   r  r  ffn_layernormr   s     r4   r   zKosmos2TextFFN.__init__  s    ~#F$>?"(";9V-v~>>9V^V-=>>\&.f>STTTr6   c                 Z   |                      |                     |                    }t          j                            || j        | j                  }|                     |          }|                     |          }t          j                            || j        | j                  }|S )Nr   )	r   r   r   r   r   rp  r   rr  r   r   s     r4   r   zKosmos2TextFFN.forward  s    **488M+B+BCC--mt?Vaean-oo**=99//--mt|VZVc-ddr6   )rn   ro   rp   r!   r   r   r   r   s   @r4   rm  rm    s[        
U0 
U 
U 
U 
U 
U 
U      r6   rm  c                       e Zd Zddef fdZ eddd          	 	 	 	 	 	 	 	 	 dd
ej        deej                 deej                 deej                 deej                 deej                 dee	         dee
         dee
         deej                 deej        eeej        ej        f                  f         fd            Z xZS )Kosmos2TextBlockNr}   c           	      4   t                                                       |j        | _        t          || j        |j        |j        dd|          | _        |j        | _        t          j	        | j        |j
                  | _        |j        rOt          || j        |j        |j        dd|          | _        t          j	        | j        |j
                  | _        t          |          | _        t          j	        | j        |j
                  | _        d S )NT)r   r   r   rR  rS  rT  r  F)r   r   r   rQ  attention_headsr   r  r   r   r  r  self_attn_layer_normadd_cross_attentionencoder_attnencoder_attn_layer_normrm  ffnfinal_layer_norm)rf   r}   rT  r   s      r4   r   zKosmos2TextBlock.__init__$  s   ),n,,%)
 
 
 ~$&LVEZ$[$[$[!% 
	c 3. 00).#! ! !D ,.<FLa+b+b+bD(!&)) "T^AV W W Wr6   rW  rU   rX  rY  FTrV   r   r\  encoder_attention_maskr]  cross_attn_layer_head_maskr   	use_cacher^  r[   c                 L   |}|                      |          } | j        d||||||
d|\  }}t          j                            || j        | j                  }||z   }d }|t          | d          st          d|  d          |}|                     |          } | j	        d|||||||
d|\  }}t          j                            || j        | j                  }||z   }|}| 
                    |          }|                     |          }||z   }|f}|r|||fz  }|S )N)rV   rU   r   r]  r   r^  r   rz  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)rV   r\  r   r]  rU   r   r^  rt   )rx  r  r   r   r   r   r8  r   r{  rz  r}  r|  )rf   rV   r   r\  r~  r]  r  rU   r   r  r^  r   r  self_attn_weightscross_attn_weightsr  s                   r4   r   zKosmos2TextBlock.forwardC  s    !11-@@+94> ,
'+)+/),
 ,
 ,
 ,
(( --mt|VZVc-dd =0 " ,400  Dd D D D  
 %H 88GGM0A0A 	1+&;5 : /"3-	1 	1 	1 	1-M- M11-4<Z^Zg1hhM$}4M !--m<< // =0 " 	?)+=>>Gr6   r   )	NNNNNNFTN)rn   ro   rp   r!   r   r   r,   r   r   r   r-   rj   rr   r   r   r   s   @r4   ru  ru  #  s       X X0 X X X X X X> _%0A6RRR 268<9=26=A+/,1$(15C C|C !.C  (5	C
 !) 6C "%,/C %-U\$:C "%C $D>C D>C !.C 
u (51BEDU1U+V"WW	XC C C SRC C C C Cr6   ru  c            '       j    e Zd ZdZdef fdZd Z	 	 	 	 	 ddeej	                 deej	                 d	eej	                 d
e
deej	                 f
dZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej	                 deej	                 deej	                 deej	                 deej	                 deej	                 deej	                 deej	                 dee         deej	                 deej	                 dee         dee         dee         dee         deej	                 dee         deeef         f$dZ xZS )Kosmos2TextTransformerz
    Transformer decoder consisting of `config.layers` layers. Each layer is a [`Kosmos2TextBlock`].

    Args:
        config: Kosmos2TextConfig
    r}   c                 <   t                                                       | _        j        | _        j        | _        j        rt          j        j                  nd| _	        t          j        j        j        j                  | _        t          j        j        j                  | _        t          j        fdt'          j                  D                       | _        t          j        j        j                  | _        d| _        d S )Nr'   )rN   )r   r0  rN   c                 2    g | ]}t          |           S ))rT  )ru  )rd   ir}   s     r4   r  z3Kosmos2TextTransformer.__init__.<locals>.<listcomp>  s'    $i$i$iq%5f%J%J%J$i$i$ir6   F)r   r   r}   r   	layerdropscale_embeddingr;  sqrtr   embed_scaler   r   
vocab_sizepad_token_idembed_tokensr/  max_position_embeddingsembed_positionsr  r  r  r  r  
layer_normr  r   s    `r4   r   zKosmos2TextTransformer.__init__  s    ~):@:PY49V%5666VYL):F<LZ`ZmnnnG 8 *+ 
  
  
 m$i$i$i$iTYZ`ZgThTh$i$i$ijj,v'79NOO&+###r6   c                     d }|d         dk    rt          ||j        |j        |          }|>t          ||j        |d                                       |j                  }||n||z   }|S )Nr;   r   )r8   r9   r%   )rF   r$   r8   r5   r*   )rf   r   rL  r  r9   combined_attention_maskexpanded_attn_masks          r4   _prepare_decoder_attention_maskz6Kosmos2TextTransformer._prepare_decoder_attention_mask  s     #'r?Q&7#$+'=	' ' '# %!-nm>Q[fgi[j!k!k!k!n!n$" " '>&E""K]`wKw $ '&r6   Nr   r  rX   img_input_maskr9   r   c                    ||                      |          }|b|                    |j                                      d|                    d                    ||                    t
          j                  <   || j        z  }|                     ||||          }|                    |j                  }||z   }t          j
                            || j        | j                  }|S )Nr;   r   )rM   r  r9   r   r   )r  r*   r8   rB   r(   r,   r-   r  r  r   r   r   r   )	rf   rM   r  rX   r  r9   r   	positionsrV   s	            r4   forward_embeddingz(Kosmos2TextTransformer.forward_embedding  s       --i88M#AMQ^QeAfAfAkAkL%%b))B BM.++%*+==> &(88 (('#9%	 ) 
 
	 LL!566	%	1--mt|VZVc-ddr6   rM   r   image_embeds_position_maskr\  r~  	head_maskcross_attn_head_maskrU   r  r   r  r  r^  r   r[   c                    ||n| j         j        }||n| j         j        }||n| j         j        }||
t	          d          |$|j        }|                    d|d                   }n.|
|
                                d d         }nt	          d          | j        r%| j	        r|rt                              d           d}|rO|	M|6t          t          | j                   t          | j                             nt          | j                   }	|rCt          |	t                    r.t                              d           t          j        |	          }	|	|	                                nd}|dk    rd }d }|                     ||
||||	          }|                     ||||          }||t)          ||
j        |d         
          }t,          j                            || j        | j	                  }|rdnd }|rdnd }|r|dnd }t3          ||gddg          D ]z\  }}|s|                                d         t5          | j                  k    rCt	          d| dt5          | j                   d|                                d          d          {t9          | j                  D ]\  }}|r||fz  }| j	        r t;          j        g           }|| j        k     r4 ||||f||||         nd |||         nd |	|||d|}|d         }|r||d         fz  }|||d         fz  }|                      |          }|r||fz  }tC          ||	|||          S )NzDYou cannot specify both input_ids and inputs_embeds at the same timer;   z5You have to specify either input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r}   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   )rM   r  rX   r  r9   r   r  r   rt   r  r  zThe `z` should be specified for z layers, but it is for .)r~  r]  r  rU   r   r  r^  r   r   )rT   rU   rV   rW   cross_attentions)"r}   r   r  r  r   r   rB   r(   r  r   loggerwarning_oncer   r   r`  rj   from_legacy_cacheget_seq_lengthr  r  r5   r$   r   r   r   ziplenr  r  r,   randr  r  r   )rf   rM   r   rX   r  r\  r~  r  r  rU   r  r   r  r   r  r  r^  r   rL  r9   rV   all_hidden_statesall_self_attnsall_cross_attentions	attn_mask	mask_namer   decoder_layerdropout_probabilityr"  s                                 r4   r   zKosmos2TextTransformer.forward  s   ( 2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	 ]%>cddd"#/K!r;r?;;II&',,..ss3KKTUUU& 	"4= 	" "##p   "	 	0 )4 $L$D$D$DlZ^ZeFfFfFfggg!555 
  	UOU;; 	U\  
 2COTTOETE`!?!?!A!A!Afg "A%%L)-&..'%5#9% / 
 
 ==K8N
 

 !,1G1S%12H-J]grsugv%w%w%w"--mt|VZVc-dd #7@BBD0:d&7h<Q<]rrdh %(4H(IKYoKp$q$q 	 	 Iy$>>##A&3t{+;+;<<$3	 3 3SEUEU 3 3%NN,,Q/3 3 3  
 #,DK"8"8 	@ 	@C# 6!m%55!} &+jnn#&77)M% (>3<3H3dI]Ii,@,E,Eos /"3#-   M *!,M  @=#3"55(4(]1-=,??( 66   	2-!118+++%1
 
 
 	
r6   )NNNr   NNNNNNNNNNNNNNNNN)rn   ro   rp   rq   r!   r   r  r   r,   r   rI   r  r   r-   r   r   r   rj   r   r   r   r   s   @r4   r  r    sa        ,0 , , , , , ,(' ' '4 15/315&'/3! !  -! u|,	!
 !.! !$! u|,! ! ! !J -115/3=A8<9=,07;+/04/3$(,0/3&*15#M
 M
EL)M
 !.M
 u|,	M

 %-U\$:M
  (5M
 !) 6M
 EL)M
 'u|4M
 "%M
  -M
 u|,M
 D>M
 $D>M
 'tnM
  d^!M
" !.#M
$ -.%M
& 
u??	@'M
 M
 M
 M
 M
 M
 M
 M
r6   r  c                   H    e Zd ZU eed<   dZddgZdZdZdZ	de
j        fdZdS )Kosmos2PreTrainedModelr}   Tr   ru  r   c                    t          | t                    r| j        j        }n-t          | t          t
          f          r| j        j        j        }t          | t          t          f          r| j        j	        }n-t          | t          t
          f          r| j        j
        j	        }t          |t                    rt          j                            |j        d|j        dz  |z             t          j                            |j        j        |j        j        |z             t          j                            |j        j        |j        j        |z             n@t          |t*                    r|j        dz  d|j        j        z  dz  z  |z  }|j        dz  |z  }t          j                            |j        j        |           t          j                            |j        j        |           t          j                            |j        j        |           t          j                            |j        j        |           nPt          |t6                    r|j        j        dz  d|j        j        z  dz  z  |z  }d|j        j        z  dz  |z  }t          j                            |j        j        |           t          j                            |j        j        |           nt          |t>                    rt          j                            |j        j        |           t          j                            |j        j        |           t          j                            |j        j        |           t          j                            |j        j        |           nt          |t@                    rXt          j                            |j        j        |           t          j                            |j        j        |           nyt          |t                    r-t          j                            |j!        j        |           n7t          |tD                    rPt          j                            |j#        j        |           t          j                            |j$                   nt          |tJ                    rf|j&        j        j'                            d|           |j&        j(        3|j&        j        j'        |j&        j(                 )                                 nWt          |t          j*                  r=|j        j'        +                    d           |j,        j'        )                                 t          |t          j-                  r'|j,        "|j,        j'        )                                 dS dS dS )zInitialize the weightsr   r   )meanstd)r  r   Nr'   ).r`  Kosmos2VisionModelr}   initializer_factorKosmos2ModelKosmos2ForConditionalGenerationvision_configKosmos2TextModelKosmos2TextForCausalLMinit_stdtext_configr|   r   initnormal_r   r   r   r   initializer_ranger   r   r  r   r   r   r   r   r   r   r   rQ  rm  lm_headKosmos2ImageToTextProjectiondenselatent_queryr  r  datarN   zero_r  fill_r   r   )rf   r   factorr  in_proj_stdout_proj_stdfc_stds          r4   _init_weightsz$Kosmos2PreTrainedModel._init_weightsy  s   d.// 	B[3FF|-LMNN 	B[.AFd-/EFGG 	3+&CC|-LMNN 	3+)2Cf566 #	%GOOF2&BRTXBX[aBaObbbGOOF29v}?^ag?gOhhhGOOF5<&-BadjBjOkkkk 677 	%!+T1q6=;Z7Z_c6cdgmmK",d2f<LGOOFM0kOBBBGOOFM0kOBBBGOOFM0kOBBBGOOFO2OEEEE 011 	%!=4d:FMDc@chl?lmpvvK&-33<vEFGOOFJ-6O:::GOOFJ-;O???? 344 	%GOOFM0cO:::GOOFM0cO:::GOOFM0cO:::GOOFO2O<<<<// 	%GOOFJ-3O777GOOFJ-3O7777 677 	%GOOFN1sO;;;; <== 		%GOOFL/SO999GOOF/0000 677 	%&+33#3FFF".:#*/0C0OPVVXXX-- 	%M$$S)))K""$$$fbi(( 	%V[-DK""$$$$$	% 	%-D-Dr6   N)rn   ro   rp   r    rs   supports_gradient_checkpointing_no_split_modules_supports_attention_backend_supports_flash_attn_supports_sdpar   Moduler  rt   r6   r4   r  r  p  sb         &*#46HI"&N2%BI 2% 2% 2% 2% 2% 2%r6   r  c                        e Zd ZU eed<   dZdef fdZdej        fdZ	e
	 	 	 	 	 ddeej                 dee         d	ee         d
edee         deeef         fd            Z xZS )r  r}   r   c                     t                                          |           t          |          | _        |                                  d S r   )r   r   r$  model	post_initr   s     r4   r   zKosmos2VisionModel.__init__  s@       -f55
r6   r[   c                 $    | j         j        j        S r   )r  r   r   rl   s    r4   get_input_embeddingsz'Kosmos2VisionModel.get_input_embeddings  s    z$44r6   NFr   r  r   r  c                 6    |                      |||||          S )N)r   r   r  r   r  r  )rf   r   r   r  r   r  s         r4   r   zKosmos2VisionModel.forward  s.     zz%/!5%=#  
 
 	
r6   r-  )rn   ro   rp   r"   rs   main_input_namer   r   r  r  r   r   r,   rr   r-   r   rj   r   r   r   r   s   @r4   r  r    s        $O2      5bi 5 5 5 5  59,0/3).&*
 
u01
 $D>
 'tn	

 #'
 d^
 
u00	1
 
 
 ^
 
 
 
 
r6   r  c            )       "    e Zd ZU eed<   def fdZdej        fdZe	e
	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 d	eej                 d
eej                 deej                 deej                 deej                 dee         deej                 deej                 dee         dee         dee         dee         deej                 dee         deeef         f$d                        Z xZS )r  r}   c                     t                                          |           t          |          | _        |                                  d S r   )r   r   r  r  r  r   s     r4   r   zKosmos2TextModel.__init__  s@       +F33
r6   r[   c                     | j         j        S r   r  r  rl   s    r4   r  z%Kosmos2TextModel.get_input_embeddings      z&&r6   NrM   r   rX   r  r\  r~  r  r  rU   r  r   r  r   r  r  r^  r   c                 |     | j         di d|d|d|d|d|d|d|d|d	|	d
|
d|d|d|d|d|d||S )a  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        rM   r   rX   r  r\  r~  r  r  rU   r  r   r  r   r  r  r^  rt   r  )rf   rM   r   rX   r  r\  r~  r  r  rU   r  r   r  r   r  r  r^  r   s                     r4   r   zKosmos2TextModel.forward  s    J tz 
 
 
i
)>
 &
 (B'A	

 #8"7
 $:#9
  i
 "6!5
 ,O
 (-
 &
  i
 0/
 "6!5
 $
  *>#
 
 	
r6   r  )rn   ro   rp   r!   rs   r   r   r  r  r   r   r   r,   r   r   r-   r   r   r   rj   r   r   r   r   s   @r4   r  r    s        0      'bi ' ' ' '  -115/3=A8<9=,07;+/04/3$(,0/3&*15#5
 5
EL)5
 !.5
 u|,	5

 %-U\$:5
  (55
 !) 65
 EL)5
 'u|45
 "%5
  -5
 u|,5
 D>5
 $D>5
 'tn5
  d^!5
" !.#5
$ -.%5
& 
u??	@'5
 5
 5
 ^ 5
 5
 5
 5
 5
r6   r  z
    The text model from KOSMOS-2 with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    c            +       t    e Zd ZU eed<   dgZdef fdZdej        fdZ	dej        fdZ
ee	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 d	eej                 d
eej                 deej                 deej                 deej                 deej                 deej                 dee         deej                 deej                 deej                 dee         dee         dee         dee         deej                 dee         deeef         f&d                        Z	 	 	 	 	 	 	 d fd	Z xZS )r  r}   zlm_head.weightc                     t                                          |           t          |          | _        t	          j        |j        |j        d          | _        | 	                                 d S )NF)in_featuresout_featuresr   )
r   r   r  r  r   r   r   r  r  r  r   s     r4   r   zKosmos2TextForCausalLM.__init__  sa       +F33
yV-=FL]dijjj 	r6   r[   c                     | j         j        S r   r  rl   s    r4   r  z+Kosmos2TextForCausalLM.get_input_embeddings(  r  r6   c                     | j         S r   )r  rl   s    r4   get_output_embeddingsz,Kosmos2TextForCausalLM.get_output_embeddings+  s
    |r6   NrM   r   rX   r  r\  r~  r  r  rU   r  r   labelsr  r   r  r  r^  r   c                    ||n| j         j        }||rt                              d           d} | j        di d|d|d|d|d|d	|d
|d|d|	d|
d|d|d|d|ddd||}|                     |d                   }d}| | j        d||| j         j        d|}t          |||j	        |j
        |j        |j                  S )aK  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        NzJThe `use_cache` argument is changed to `False` since `labels` is provided.FrM   r   rX   r  r\  r~  r  r  rU   r  r   r  r   r  r  Tr^  r   )rx   r  r  )rw   rx   rU   rV   rW   r  rt   )r}   r  r  warningr  r  loss_functionr  r   rU   rV   rW   r  )rf   rM   r   rX   r  r\  r~  r  r  rU   r  r   r  r  r   r  r  r^  r   r  	lm_logitsrw   s                         r4   r   zKosmos2TextForCausalLM.forward.  s   T &1%<kk$+B] mklllI$* 
 
 
i
)>
 &
 (B'A	

 #8"7
 $:#9
  i
 "6!5
 ,O
 (-
 &
  i
 0/
 "6!5
 
  *>#
 
& LL,,	%4%sYvRVR]RhsslrssD0#3!/)$5
 
 
 	
r6   c	                    |d         dk    rd }d }n|||                                 d d         n|                                 \  }
}|                                 d         }t          j        |t          j        |
||z
  ft          j        |j                  fd          } t                      j        |f|||||||d|	}|                    dd            |S )Nr   r;   )r(   r$   r8   r   r=   )rU   r   rX   r  r  r  r^  r   )	r(   r,   rC   rD   r-   r8   r   prepare_inputs_for_generationpop)rf   rM   rX   r  rU   r   r  r  r^  model_kwargsr   rI  mask_lenmodel_inputsr   s                 r4   r  z4Kosmos2TextForCausalLM.prepare_inputs_for_generation  s#    !!!L)-&& (3?L?X-"4"4"6"6ss";";^g^l^l^n^nJ16688<H)..Kj'H2D%EUZ`i`pqqq * * *& =uww<

+)%'A')

 

 

 

 	...r6   )NNNNNNNNNNNNNNNNN)NNNNNNN)rn   ro   rp   r!   rs   _tied_weights_keysr   r   r  r  r  r   r   r   r,   r   r   
LongTensorr-   r   r   r   rj   r   r   r  r   r   s   @r4   r  r    s         *+0      'bi ' ' ' 'ry      -115/3=A8<9=,07;+/04/3-1$(,0/3&*15%O
 O
EL)O
 !.O
 u|,	O

 %-U\$:O
  (5O
 !) 6O
 EL)O
 'u|4O
 "%O
  -O
 u|,O
 )*O
 D>O
 $D>O
  'tn!O
" d^#O
$ !.%O
& +,'O
( 
u77	8)O
 O
 O
 ^ O
h #'- - - - - - - - - -r6   r  c                   .     e Zd ZdZdef fdZd Z xZS )r  zmThe layer that transforms the image model's output to part of the text model's input (namely, image features)r}   c                    t                                                       t          j        |j        j        |j        j                  | _        t          j	        t          j        |j        |j        j                            | _        t          |j        |j        j        |j        j        |j        j        dd          | _        d S )NF)r   rR  rS  )r   r   r   r   r  r   r  r   r  r   r,   r   latent_query_numr  rQ  rw  r   x_attnr   s     r4   r   z%Kosmos2ImageToTextProjection.__init__  s    Yv3?ASA]^^
LV5LfN`Nj)k)kll)(.&8%*
 
 
r6   c                 "   |                      |          }| j                            d                              |                    d          dd          }t          j        ||gd          }|                     ||d d d           \  }}||fS )Nr   r;   r   r=   )rV   r\  rU   r   r   )r  r  r   r)   r(   r,   rC   r  )rf   featuresrV   r  key_value_statesr   s         r4   r   z$Kosmos2ImageToTextProjection.forward  s    

8,, (22155<<]=O=OPQ=R=RTVXZ[[ 9m\%BJJJ&*kk&"2 " '2 '
 '
#| l**r6   )rn   ro   rp   rq   r    r   r   r   r   s   @r4   r  r    sY        ww
} 
 
 
 
 
 
+ + + + + + +r6   r  z}
    KOSMOS-2 Model for generating text and image features. The model consists of a vision encoder and a language model.
    c            %           e Zd ZU eed<   dZdef fdZdej        fdZ	d Z
	 	 ddej        dee         d	ee         fd
Zee	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 dee         deej                 deej                 deej                 dee         dee         dee         d	edee         dee         deeef         f d                        Z xZS )r  r}   r   c                     t                                          |           t          |j                  | _        t          |j                  | _        t          |          | _	        | 
                                 d S r   )r   r   r  r  
text_modelr  r  vision_modelr  image_to_text_projectionr  r   s     r4   r   zKosmos2Model.__init__  sh       *6+=>>.v/CDD(DV(L(L% 	r6   r[   c                 $    | j         j        j        S r   r   r  r  rl   s    r4   r  z!Kosmos2Model.get_input_embeddings      $11r6   c                 (    || j         j        _        d S r   r  rf   r   s     r4   set_input_embeddingsz!Kosmos2Model.set_input_embeddings      -2***r6   Freturn_attentionsr   c                     |                      ||          }| j         j                            |d                   }t          j                            |d          }|                     |          \  }}|r||fS |S )aD  
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
            return_attentions (`bool`, *optional*, defaults to `False`):
                Whether to return `projection_attentions` or not.
            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
                Whether to interpolate positional embeddings or not.
        )r   r   r   r;   r=   )r  r  r(  r   r   	normalizer  )rf   r   r
  r   rZ   rX   rY   s          r4   get_image_featureszKosmos2Model.get_image_features  s    " #//%%= 0 
 

 (.==>QRS>TUU}..|.DD.2.K.KL.Y.Y++ 	7!666r6   NrM   r  r   r  rU   rX   r  r   r  r   r  r  r   c                 N   ||n| j         j        }||n| j         j        }||n| j         j        }d}d}|,|t	          d          |                     |d|          \  }} | j        d||||||||	|
||dd|}t          |j        |j	        |j
        |j        |||          S )aE  
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Kosmos2Model

        >>> model = Kosmos2Model.from_pretrained("microsoft/kosmos-2-patch14-224")
        >>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

        >>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> text = (
        ...     "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863>"
        ...     "</object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911>"
        ...     "</object>"
        ... )

        >>> inputs = processor(text=text, images=image, return_tensors="pt", add_eos_token=True)

        >>> last_hidden_state = model(
        ...     pixel_values=inputs["pixel_values"],
        ...     input_ids=inputs["input_ids"],
        ...     attention_mask=inputs["attention_mask"],
        ...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
        ... ).last_hidden_state
        >>> list(last_hidden_state.shape)
        [1, 91, 2048]
        ```N<You have to specify either `pixel_values` or `image_embeds`.T)r
  r   )rM   r   rX   r  r  rU   r  r   r  r   r  r  )rT   rU   rV   rW   rX   rY   rZ   rt   )r}   r   r  r  r   r  r   rS   rT   rU   rV   rW   )rf   r   rM   r  r   r  rU   rX   r  r   r  r   r  r   r  r   rZ   rY   r  s                      r4   r   zKosmos2Model.forward  s"   x 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]" $# !_```262I2IOg 3J 3 3/L/ "$/ 
)%'A+'%/!5
 
 
 
  "%7#3!/)%"7 3
 
 
 	
r6   )FF)NNNNNNNNNNNNFN)rn   ro   rp   r    rs   r  r   r   r  r  r  r,   rr   r   r-   r  r   r   r   r   r   r   r   rj   rS   r   r   r   s   @r4   r  r    s=         $O}      2bi 2 2 2 23 3 3 -238	 ' $D> #+4.	   >  04,0=A15,0+//304/3$(,0/3).&*a
 a
u|,a
 EL)a
 %-U\$:	a

 !.a
 EL)a
 "%a
 u|,a
  -a
 u|,a
 D>a
 $D>a
 'tna
 #'a
 d^a
  -.!a
" 
u((	)#a
 a
 a
 ^ a
 a
 a
 a
 a
r6   r  z
    KOSMOS-2 Model for generating text and bounding boxes given an image. The model consists of a vision encoder and a
    language model.
    c            #           e Zd ZU eed<   dZdgZdef fdZdej	        fdZ
d Zdej	        fdZd	 Zee	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 dee         deej                 deej                 deej                 deej                 dee         dee         dee         dee         deeef         fd                        Z ej                    	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 fd            Z xZS )r  r}   r   ztext_model.lm_head.weightc                     t                                          |           t          |j                  | _        t          |j                  | _        t          |          | _	        | 
                                 d S r   )r   r   r  r  r   r  r  r  r  r  r  r   s     r4   r   z(Kosmos2ForConditionalGeneration.__init__}  sh       01CDD.v/CDD(DV(L(L% 	r6   r[   c                 $    | j         j        j        S r   r  rl   s    r4   r  z4Kosmos2ForConditionalGeneration.get_input_embeddings  r  r6   c                 (    || j         j        _        d S r   r  r  s     r4   r  z4Kosmos2ForConditionalGeneration.set_input_embeddings  r	  r6   c                 4    | j                                         S r   )r   r  rl   s    r4   r  z5Kosmos2ForConditionalGeneration.get_output_embeddings  s    44666r6   c                 :    | j                             |           d S r   )r   set_output_embeddings)rf   new_embeddingss     r4   r  z5Kosmos2ForConditionalGeneration.set_output_embeddings  s    --n=====r6   NrM   r  r   r  rU   rX   r  r   r  r  r   r  r   c                    ||n| j         j        }||n| j         j        }d}d}||t          d          |                     |||          }| j        j                            |d                   }t          j        	                    |d          }| 
                    |          \  }} | j        d
||||||||	|
|||dd|}t          |j        |j        |j        |j        |j        |||	          S )a5  
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Kosmos2ForConditionalGeneration

        >>> model = Kosmos2ForConditionalGeneration.from_pretrained("microsoft/kosmos-2-patch14-224")
        >>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

        >>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> prompt = "<grounding> An image of"

        >>> inputs = processor(text=prompt, images=image, return_tensors="pt")

        >>> generated_ids = model.generate(
        ...     pixel_values=inputs["pixel_values"],
        ...     input_ids=inputs["input_ids"],
        ...     attention_mask=inputs["attention_mask"],
        ...     image_embeds=None,
        ...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
        ...     use_cache=True,
        ...     max_new_tokens=64,
        ... )
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> processed_text = processor.post_process_generation(generated_text, cleanup_and_extract=False)
        >>> processed_text
        '<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>.'

        >>> caption, entities = processor.post_process_generation(generated_text)
        >>> caption
        'An image of a snowman warming himself by a fire.'

        >>> entities
        [('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]
        ```Nr  )r   r   r  r   r;   r=   T)rM   r   rX   r  r  rU   r  r   r  r  r   r  r  )rw   rx   rU   rV   rW   rX   rY   rZ   rt   )r}   r   r  r   r  r  r(  r   r   r  r  r   rv   rw   rx   rU   rV   rW   )rf   r   rM   r  r   r  rU   rX   r  r   r  r  r   r  r   rZ   rY   
lm_outputss                     r4   r   z'Kosmos2ForConditionalGeneration.forward  sf   N 2C1N--TXT_Tq$8$D  $+Jj 	 # $# !_```"&"3"3)"3%9 #4 # #  ,2AABUVWBXYYL=22<R2HHL262O2OP\2]2]/L/$T_ 
)%'A+'%/!5
 
 
 

" :$&6$2!,%"7 3	
 	
 	
 		
r6   c           	         |                     dd           }||t          d| d          |||}|s|                     |          }	| j        j                            |	d                   }t
          j                            |d          }|                     |          \  }}
 | j	        j
        d|||||d|}|S )	Ninputsz
`inputs`: zp were passed alongside `pixel_values` which is not allowed.Make sure to either pass `inputs` or pixel_values=...r   r;   r=   )rM   r   rX   r  r  rt   )r  r   r  r  r(  r   r   r  r  r   generate)rf   r   r  rM   r   rX   r  r   r  rZ   rY   outputs               r4   r  z(Kosmos2ForConditionalGeneration.generate  s    Hd++#(:IV I I I   F$6!L"&"3"3L"A"A,2AABUVWBXYYL=22<R2HHL262O2OP\2]2]/L/)) 
)%'A'
 
 
 
 r6   )NNNNNNNNNNNNN)NNNNNN)rn   ro   rp   r    rs   r  r  r   r   r  r  r  r  r  r   r   r   r,   r   r   r  r-   r   r   r   rj   rv   r   rO  r  r   r   s   @r4   r  r  r  s         $O56	} 	 	 	 	 	 	2bi 2 2 2 23 3 37ry 7 7 7 7> > >  04,0=A15,0+//304/3-1$(,0/3u
 u
u|,u
 EL)u
 %-U\$:	u

 !.u
 EL)u
 "%u
 u|,u
  -u
 u|,u
 )*u
 D>u
 $D>u
 'tnu
 +,u
  
u@@	A!u
 u
 u
 ^ u
n U]__ 04=A,015/304% %u|,% %-U\$:% EL)	%
 !.% u|,%  -% % % _% % % % %r6   r  )r  r  r  r   )r   )r   )Qrq   r;  dataclassesr   typingr   r   r   r   r,   r   activationsr
   cache_utilsr   r   r   
generationr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   utils.deprecationr   configuration_kosmos2r    r!   r"   
get_loggerrn   r  r   r$   rI   r5   Sizer8   rF   rP   rS   rv   r  r|   r?  r   r   r   r   r  r$  r/  rQ  rm  ru  r  r  r  r  r  r  r  r  __all__rt   r6   r4   <module>r.     s~      ! ! ! ! ! ! 1 1 1 1 1 1 1 1 1 1 1 1        ! ! ! ! ! ! C C C C C C C C C C ) ) ) ) ) ) B B B B B B 9 9 9 9 9 9            G F F F F F F F & & & & & & j j j j j j j j j j j j j j j j 0 0 0 0 0 0 X X X X X X X X X X 
	H	%	%[ [u| [EK [(3- [ [ [ [ jk\ \Z\(-\=B\\cf\ \ \ \$4 4 4 4    
 
  
  
  
  
  
  
   
F   
%
 %
 %
 %
 %
 %
 %
  %
RP P P P Pbi P P Pv % %I%<% 
% <	%
 U\*% % % % % %,F) F) F) F) F)RY F) F) F)T    ry    / / / / / : / / /fT
 T
 T
 T
 T
29 T
 T
 T
p3
 3
 3
 3
 3
ry 3
 3
 3
nUc Uc Uc Uc Ucry Uc Uc Ucpv) v) v) v) v)") v) v) v)r    RY   .d d d d d1 d d dNc
 c
 c
 c
 c
RY c
 c
 c
L :% :% :% :% :%_ :% :% :%z
 
 
 
 
/ 
 
 
BC
 C
 C
 C
 C
- C
 C
 C
L   S S S S S3_ S S Sl +  +  +  +  +29  +  +  +F   
V
 V
 V
 V
 V
) V
 V
 
V
r   { { { { {&<o { { {| X
W
Wr6   