
     `ij                     V   d Z ddlmZ ddlmZmZ ddlZddlmZ ddlm	Z	m
Z
 ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZmZmZ ddlmZ ddlmZ  ej        e           Z!e ed           G d de                                  Z"e ed           G d de                                  Z# G d dej$                  Z%e G d de                      Z& ed           G d de&                      Z' ed           G d  d!e&e                      Z(g d"Z)dS )#zPyTorch PaliGemmamodel.    )	dataclass)OptionalUnionN)nn   )CacheStaticCache)GenerationMixin)FlashAttentionKwargs)BaseModelOutputWithPast)PreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging   )	AutoModel   )PaliGemmaConfigzN
    Base class for Paligemma outputs, with hidden states and attentions.
    )custom_introc                   8    e Zd ZU dZdZeej                 ed<   dS )PaligemmaModelOutputWithPasta  
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)	__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/paligemma/modeling_paligemma.pyr   r   +   s7           8<%"34;;;;;r$   r   zU
    Base class for PaliGemma causal language model (or autoregressive) outputs.
    c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
ee         ed<   dZeeej                          ed<   dZeeej                          ed<   dZeej                 ed<   dS )	PaliGemmaCausalLMOutputWithPasta8  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr   )r   r   r   r   r(   r   r    r!   r"   r)   r*   r   r+   tupler,   r   r#   r$   r%   r'   r'   ;   s           )-D(5$
%,,,*.FHU&'...'+OXe_+++8<M8E%"345<<<59Ju0129997;%"34;;;;;r$   r'   c                   *     e Zd Zdef fdZd Z xZS )PaliGemmaMultiModalProjectorconfigc                     t                                                       t          j        |j        j        |j        j        d          | _        d S )NTbias)super__init__r   Linearvision_confighidden_sizeprojection_dimlinearselfr0   	__class__s     r%   r5   z%PaliGemmaMultiModalProjector.__init__Z   sB    i 4 @&BVBelpqqqr$   c                 0    |                      |          }|S N)r:   )r<   image_featuresr+   s      r%   forwardz$PaliGemmaMultiModalProjector.forward^   s    N33r$   )r   r   r   r   r5   rA   __classcell__r=   s   @r%   r/   r/   Y   sZ        r r r r r r r      r$   r/   c                   F    e Zd ZU eed<   dZdZdgZdZdZ	dZ
dZdZdZd ZdS )	PaliGemmaPreTrainedModelr0    Tr/   r*   Fc                 2   t          | j        d| j                                        j                  }t	          |t
          j                  rH|j        j        	                    d|           |j
        "|j
        j                                         d S d S d S )Ninitializer_range        )meanstd)getattrr0   get_text_configrH   
isinstancer   r6   weightdatanormal_r3   zero_)r<   modulerK   s      r%   _init_weightsz&PaliGemmaPreTrainedModel._init_weightsr   s     dk#68S8S8U8U8ghhfbi(( 	)M&&CS&999{& &&(((((	) 	)&&r$   N)r   r   r   r   r"   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_can_compile_fullgraph_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendrT   r#   r$   r%   rE   rE   d   si         &*#78"3"N"&) ) ) ) )r$   rE   z|
    The Base Paligemma model which consists of a vision backbone and a language model without language modeling head.,
    c            #       <    e Zd ZddiZdZdef fdZd Zd Zd Z	d	 Z
	 	 	 	 	 d dee         fdZdej        fdZdej        dej        dej        fdZee	 	 	 	 	 	 	 	 	 	 	 	 	 d!deej                 deej                 deej                 deej                 dee         deej                 deej                 deej                 deej                 dee         dee         dee         dee         dee         deeef         fd                        Z xZS )"PaliGemmaModelzlanguage_model.modellanguage_modelFr0   c                    t                                          |           t          j        |j                  | _        t          |          | _        |j        j	        | _	        t          j        |j                  }|| _
        | j        j        | j        j        nd| _        | j                                        j        p| j        | _        |                                  d S )N)r0   )r4   r5   r   from_configr7   vision_towerr/   multi_modal_projectortext_config
vocab_sizer`   r0   pad_token_idrM   dtypetext_config_dtype	post_init)r<   r0   r`   r=   s      r%   r5   zPaliGemmaModel.__init__   s       %19MNNN%A&%I%I" ,7".f6HIII,8<8P8\DK44bd!%!<!<!>!>!D!R
r$   c                 4    | j                                         S r?   )r`   get_input_embeddingsr<   s    r%   rm   z#PaliGemmaModel.get_input_embeddings   s    "77999r$   c                 :    | j                             |           d S r?   )r`   set_input_embeddingsr<   values     r%   rp   z#PaliGemmaModel.set_input_embeddings   s    0077777r$   c                     || _         d S r?   r`   r<   decoders     r%   set_decoderzPaliGemmaModel.set_decoder   s    %r$   c                     | j         S r?   rt   rn   s    r%   get_decoderzPaliGemmaModel.get_decoder   s    ""r$   Nis_trainingc                 4   | j         j        j        dk    r
|d|v r|S d S ||n| j        }t	          |t
                    }t          j        | j                  j	        }||}|j
        d d         \  }	}
|r|                                }n5t	          |t          j                  r|j
        d         n|d         |
z   dz   }||                                dk    r|S t          j        |
|f|| j        |j                  }|
dk    r$|rt          j        |d	          }nd|d d d |
f<   |t          j        ||j        
          |                    dd          k    z  }|d d d d d d f                             |	ddd          }||                                }|j
        d         }|rr|t+          d          |d d d d d d d |f                             |d d d d d d f                             |j                  dk    d          |d d d d d d d |f<   |d d d d d d d |f         |d d d d d d f                             |j                  z   }|dk    }|d d d d d d d |f                             ||          |d d d d d d d |f<   |S )Nflash_attention_2rI   r   rb   r   r      
fill_valueri   devicediagonalr   z/Token type ids must be provided during training)r0   rf   _attn_implementationtrainingrN   r	   r    finforj   minshapeget_max_cache_shapeTensordimfullr   triuarangereshapeexpandclone
ValueErrormasked_fillto)r<   attention_masktoken_type_idsr*   cache_positioninput_tensorrz   using_static_cache	min_dtypeinputs_lead_dimsequence_lengthtarget_lengthcausal_maskmask_lengthpadding_masks                  r%   _update_causal_maskz"PaliGemmaModel._update_causal_mask   s\    ;"7;NNN)c^.C.C%%4%0%<kk$-'EEK 677;	)L+7+=bqb+A( 	+??AAMM nel;;=$R((#A&81<  %.*<*<*>*>!*C*C!!jm, (!(	
 
 
 a 7#jqAAA36AAA///0u|M.:OPPPSaSiSijlnoSpSppp!$aaa"23::?ArSUVV%%++--K(.r2K  !)$%VWWW5@AAAqqq,;,AV5W5c5c"111dD!!!#34778JKKqPRS6 6AAAqqq!!!\k\12
 'qqq!!!QQQ'<=qqqRVX\^_^_^_O_@`@c@cdodv@w@wwL'1,L1<QQQ111l{l=R1S1_1_i2 2K111aaa+-. r$   pixel_valuesc                     |                      |          }|j        }|                     |          }|| j        j        j        dz  z  }|S )a  
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        g      ?)rd   last_hidden_statere   r0   rf   r8   )r<   r   image_outputsselected_image_featurer@   s        r%   get_image_featuresz!PaliGemmaModel.get_image_features   sR     )),77!.!@334JKK'4;+B+NPS+STr$   	input_idsinputs_embedsr@   c                 \   |e| |                                  t          j        | j        j        t          j        |j                            k    }|                    d          }n|| j        j        k    }|                                }|	                    d          
                    |                              |j                  }|j        d         |j        d         z  }||                                         |                                k    rt          d| d|           |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        N)ri   r   rb   r   r   z6Image features and image tokens do not match: tokens: z, features )rm   r    tensorr0   image_token_idlongr   allsum	unsqueeze	expand_asr   r   numelr   )r<   r   r   r@   special_image_maskn_image_tokensn_image_featuress          r%   get_placeholder_maskz#PaliGemmaModel.get_placeholder_mask   s/    !.2M$2K2K2M2MT[7uzR_Rfggg3 3 " "4!7!7!;!;!*dk.H!H+//11/99"==GGVVYYZgZnoo)/2^5I!5LL+,22448L8L8N8NNNvvvdtvv   "!r$   r   position_idsr*   r   r   labels	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictkwargsreturnc                    |du |duz  rt          d          ||n| j        j        }||n| j        j        }||n| j        j        }|duo|	du}|?| j        j        | j        k    r*|| j        j        k    }|                                }d||<   n|}| |                                 |          }|B||	                                nd}t          j        |||j        d         z   |j                  }||                    d          dz   }|c|                     |          }|                    |j        |j                  }|                     |||          }|                    ||          }|                     ||||||          } | j        d
|||||
||d|d	|}t-          |j        |j        |j        |j        ||nd	          S )i  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, PaliGemmaForConditionalGeneration

        >>> model = PaliGemmaForConditionalGeneration.from_pretrained("google/paligemma2-3b-mix-224")
        >>> processor = AutoProcessor.from_pretrained("google/paligemma2-3b-mix-224")

        >>> prompt = "Where is the cat standing?"
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs,)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Where is the cat standing?\nsnow"
        ```Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r   )r   r@   T)	r   r   r*   r   r   r   r   r   r   )r   r*   r+   r,   r   r#   )r   r0   r   r   use_return_dictr   rg   r   rm   get_seq_lengthr    r   r   r   r   r   r   ri   r   masked_scatterr   r`   r   r   r*   r+   r,   )r<   r   r   r   r   r*   r   r   r   r   r   r   r   r   r   rz   r   llm_input_idspast_seen_tokensr@   r   outputss                         r%   rA   zPaliGemmaModel.forward  sy   ^ -t";< 	[YZZZ1B1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]$D0GV45G  T[%?4?%R%R!*dk.H!H%OO--M01M,--%M 7D5577FFM!CRC^==???de"\ "2]5H5K"KTaTh  N )33A66:L #!44\BBN+..}/C]EXYYN!%!:!:~ "; " " *889K^\\M..NO^]\g
 
 &$% 
&%+'/!5)
 
 
 
 ,%7#3!/)2>2JPT
 
 
 	
r$   )NNNNN)NNNNNNNNNNNNN)r   r   r   _checkpoint_conversion_mappingaccepts_loss_kwargsr   r5   rm   rp   rw   ry   r   boolr   r    r!   r   
LongTensorr   r   r   r   r   r   r   r   r-   r   rA   rB   rC   s   @r%   r_   r_   }   s        '=>N%O"      : : :8 8 8& & &# # # &*C C d^C C C CJu/@     ")":?:K"]b]n" " " "0  15481537+/595959-1$(,0/3&*k
 k
E,-k
 u01k
 !.	k

 u/0k
 "%k
 !!12k
 !!12k
   12k
 )*k
 D>k
 $D>k
 'tnk
 d^k
 -.k
  
u22	3!k
 k
 k
 ^ k
 k
 k
 k
 k
r$   r_   c            %           e Zd ZdddddZdgZdef fdZd	 Zd
 Zd Z	d Z
d Zed             Zed             Zed             Zee	 	 	 	 	 	 	 	 	 	 	 	 	 	 d+deej                 deej                 deej                 deej                 dee         deej                 deej                 deej                 deej                 dee         dee         dee         dee         d eeej        f         d!ee         d"eeef         f d#                        Z	 	 	 	 	 	 	 	 	 	 d, fd%	Z e!dej        d&ed'ed(ej"        dej        d)efd*            Z# xZ$S )-!PaliGemmaForConditionalGenerationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projectorlm_head)z^language_model.modelz^vision_towerz^multi_modal_projectorz^language_model.lm_headzlm_head.weightr0   c                     t                                          |           t          |          | _        t	          j        |j        j        |j        j        d          | _	        | 
                                 d S )NFr2   )r4   r5   r_   modelr   r6   rf   r8   rg   r   rk   r;   s     r%   r5   z*PaliGemmaForConditionalGeneration.__init__  se       #F++
y!3!?ASA^ejkkkr$   c                 4    | j                                         S r?   )r   rm   rn   s    r%   rm   z6PaliGemmaForConditionalGeneration.get_input_embeddings  s    z..000r$   c                 :    | j                             |           d S r?   )r   rp   rq   s     r%   rp   z6PaliGemmaForConditionalGeneration.set_input_embeddings  s    
''.....r$   c                 :    | j                             |           d S r?   )r   rw   ru   s     r%   rw   z-PaliGemmaForConditionalGeneration.set_decoder  s    
w'''''r$   c                 4    | j                                         S r?   )r   ry   rn   s    r%   ry   z-PaliGemmaForConditionalGeneration.get_decoder  s    z%%'''r$   c                 6    | j                             |          S r?   )r   r   )r<   r   s     r%   r   z4PaliGemmaForConditionalGeneration.get_image_features  s    z,,\:::r$   c                     | j         j        S r?   )r   r`   rn   s    r%   r`   z0PaliGemmaForConditionalGeneration.language_model  s    z((r$   c                     | j         j        S r?   )r   rd   rn   s    r%   rd   z.PaliGemmaForConditionalGeneration.vision_tower  s    z&&r$   c                     | j         j        S r?   )r   re   rn   s    r%   re   z7PaliGemmaForConditionalGeneration.multi_modal_projector  s    z//r$   Nr   r   r   r   r   r*   r   r   r   r   r   r   r   r   logits_to_keepr   r   c                    ||n| j         j        }||n| j         j        }||n| j         j        } | j        d||||||||
|	||d|d|}|d         }t          |t                    rt          | d          n|}|                     |dd|ddf                   }d}|	  | j	        d||	| j         j
        j        d|}t          |||j        |j        |j        |j                  S )r   NT)r   r   r   r   r   r*   r   r   r   r   r   r   r   r   )r)   r   rg   )r(   r)   r*   r+   r,   r   r#   )r0   r   r   r   r   rN   intslicer   loss_functionrf   rg   r'   r*   r+   r,   r   )r<   r   r   r   r   r*   r   r   r   r   r   r   r   r   r   r   r   r+   slice_indicesr)   r(   s                        r%   rA   z)PaliGemmaForConditionalGeneration.forward  sg   ^ 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]$* 
%))%+'/!5)
 
 
 
"  
8B>SV8W8Wk~ot444]kmAAA}aaa,?@AA%4% f9P9[ _e D /#3!/) ' ;
 
 
 	
r$   Tc                     t                      j        |f||||||	|
|d|}|                    d          |dxx         dz  cc<   |d         dk    r||d<   |d uo|d u}t          |t                    ot          |j                  }|d         dk    r,|r*||n|}| j                            ||||||          }||d<   |S )N)r*   r   r   r   r   r   r   r   r   r   r   r   r   )	r4   prepare_inputs_for_generationgetrN   r	   any
is_slidingr   r   )r<   r   r*   r   r   r   r   r   r   r   r   r   r   model_inputsrz   is_static_hybrid_cacher   r   r=   s                     r%   r   z?PaliGemmaForConditionalGeneration.prepare_inputs_for_generation	  s,     =uww<
+')%)))
 
 
 
 N++7(((A-((( !!!+7L($D0GV45G!+O[!I!I!mcRaRlNmNm!!!&<!,9,E==9L*88Q]_j K .9L)*r$   r   r   ri   
batch_sizec                    | |                                  dk    r| }nMt          j        |          j        }t          j        ||f|||j                  }|dk    rt          j        |d          }|t          j        ||j                  |                    dd          k    z  }|ddddddf         	                    |ddd          }| |
                                }| j        d         }	|ddddddd|	f         | ddddddf                             |j                  z   }
|
dk    }
|ddddddd|	f                             |
|          |ddddddd|	f<   |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        Nr}   r~   r   r   r   rb   r   )r   r    r   r   r   r   r   r   r   r   r   r   r   r   )r   r   r   ri   r   r   r   r   r   r   r   s              r%   5_prepare_4d_causal_attention_mask_with_cache_positionzWPaliGemmaForConditionalGeneration._prepare_4d_causal_attention_mask_with_cache_position8  s   > %.*<*<*>*>!*C*C(KKE**.I* -0Ye\j\q  K !###jqAAA5<n>STTTWeWmWmnprsWtWtttK%dD!!!QQQ&67>>z1bRTUUK))//11,226*111aaaL[L+@ANSTSTSTVZ\`bcbcbcScDdDgDg&E E    ,q05@AAAqqq,;,AV5W5c5c )6 6AAAqqq!!!\k\12 r$   )NNNNNNNNNNNNNr   )
NNNNNNNTNN)%r   r   r   r   _tied_weights_keysr   r5   rm   rp   rw   ry   r   propertyr`   rd   re   r   r   r   r    r   r!   r   r   r   r   r   r   r   r-   r'   rA   r   staticmethodri   r   rB   rC   s   @r%   r   r     s#        "8-"?#,	& &" ++      1 1 1/ / /( ( (( ( (; ; ; ) ) X) ' ' X' 0 0 X0  15481537+/595959-1$(,0/3&*34V
 V
E,-V
 u01V
 !.	V

 u/0V
 "%V
 !!12V
 !!12V
   12V
 )*V
 D>V
 $D>V
 'tnV
 d^V
 c5</0V
  +,!V
" 
u55	6#V
 V
 V
 ^ V
v - - - - - -^ 444 4 {	4
 4 4 4 4 \4 4 4 4 4r$   r   )r   rE   r_   )*r   dataclassesr   typingr   r   r    r   cache_utilsr   r	   
generationr
   modeling_flash_attention_utilsr   modeling_outputsr   modeling_utilsr   processing_utilsr   utilsr   r   r   r   r   autor   configuration_paligemmar   
get_loggerr   loggerr   r'   Moduler/   rE   r_   r   __all__r#   r$   r%   <module>r      s     ! ! ! ! ! ! " " " " " " " "        - - - - - - - - ) ) ) ) ) ) B B B B B B 7 7 7 7 7 7 - - - - - - & & & & & &                    4 4 4 4 4 4 
	H	%	%   
< < < < <#: < <  <   
< < < < <k < <  <0    29    ) ) ) ) ) ) ) )0   
z
 z
 z
 z
 z
- z
 z
 
z
z   
j j j j j(@/ j j 
jZ ^
]
]r$   