
     `i48                        d dl mZmZ d dlZd dlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ d	d
lmZmZmZmZmZmZ d	dlmZ ddlmZ  ej        e          Z G d de          Z G d dej                  Z G d dej                  Z G d de          Z G d de          Z  G d de          Z! G d de          Z" G d de          Z#g dZ$dS )    )OptionalUnionN)nn   )ACT2FN)Cache)Unpack)logging   )LlavaCausalLMOutputWithPastLlavaForConditionalGeneration
LlavaModelLlavaModelOutputWithPastLlavaPreTrainedModelTransformersKwargs)MistralRMSNorm   )Mistral3Configc                       e Zd ZdS )Mistral3RMSNormN__name__
__module____qualname__     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/mistral3/modular_mistral3.pyr   r   (           Dr   r   c                   Z     e Zd ZdZdef fdZdej        dej        dej        fdZ xZ	S )Mistral3PatchMergerz<
    Learned merging of spatial_merge_size ** 2 patches
    configc                     t                                                       || _        |j        j        }|j        | _        | j        j        j        | _        t          j        || j        dz  z  |d          | _	        d S )Nr   Fbias)
super__init__r!   vision_confighidden_sizespatial_merge_size
patch_sizer   Linearmerging_layer)selfr!   r(   	__class__s      r   r&   zMistral3PatchMerger.__init__1   ss    *6"(";+3>Y{T5La5O'OQ\chiiir   image_featuresimage_sizesreturnc                      fd|D             }d |D             }|j         d         }g }t          |                    |                    D ]\  }}||         \  }}	|                    ||	|                              ddd                              d          }
t          j        j        	                    |
 j
         j
                  }|                    | j
        dz  z  d                                          }|                    |           t          j        |d          }                     |          }|S )	Nc                 P    g | ]"}|d          j         z  |d         j         z  f#S )r   r   )r*   ).0
image_sizer-   s     r   
<listcomp>z/Mistral3PatchMerger.forward.<locals>.<listcomp>;   sA     
 
 
U_Z]do-z!}/OP
 
 
r   c                     g | ]
\  }}||z  S r   r   )r4   hws      r   r6   z/Mistral3PatchMerger.forward.<locals>.<listcomp>?   s     :::daAE:::r   r   r   r   )kernel_sizestridedim)shape	enumeratesplitviewpermute	unsqueezetorchr   
functionalunfoldr)   tappendcatr,   )r-   r/   r0   tokens_per_imagedpermuted_tensorimage_indeximage_tokensr8   r9   
image_gridgrids   `           r   forwardzMistral3PatchMerger.forward:   sa   
 
 
 
cn
 
 
 ;:k::: $)2>3G3GHX3Y3Y)Z)Z 	) 	)%K{+DAq%**1a33;;Aq!DDNNqQQJ8&--(?H_ .  D 99Q!8!!;;R@@BBDDD""4((((?:::++N;;r   )
r   r   r   __doc__r   r&   rE   TensorrR   __classcell__r.   s   @r   r    r    ,   s         j~ j j j j j jel  RWR^        r   r    c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )Mistral3MultiModalProjectorr!   c                 ,   t                                                       t          |j        j        |j        j                  | _        t          |          | _	        t          |j        t                    rdnt          |j                  }t          j        |j        j        |z  |j        j        |j                  | _        t$          |j                 | _        t          j        |j        j        |j        j        |j                  | _        d S )N)epsr   r#   )r%   r&   r   r'   r(   text_configrms_norm_epsnormr    patch_merger
isinstancevision_feature_layerintlenr   r+   multimodal_projector_biaslinear_1r   projector_hidden_actactlinear_2)r-   r!   num_feature_layersr.   s      r   r&   z$Mistral3MultiModalProjector.__init__S   s    #F$8$D&J\Jijjj	/77",V-H#"N"NtQQTWX^XsTtTt	 ,/AA*1
 
 

 &56	*F,>,JQWQq
 
 
r   r/   r0   c                     |                      |          }|                     ||          }|                     |          }|                     |          }|                     |          }|S N)r]   r^   rd   rf   rg   )r-   r/   r0   hidden_statess       r   rR   z#Mistral3MultiModalProjector.forwardc   sa    >22**>;GGn55//m44r   )	r   r   r   r   r&   rE   rT   rR   rU   rV   s   @r   rX   rX   R   sj        
~ 
 
 
 
 
 
 el         r   rX   c                       e Zd ZdS )Mistral3CausalLMOutputWithPastNr   r   r   r   rm   rm   l   r   r   rm   c                       e Zd ZdS )Mistral3ModelOutputWithPastNr   r   r   r   ro   ro   p   r   r   ro   c                       e Zd ZdS )Mistral3PreTrainedModelNr   r   r   r   rq   rq   t   r   r   rq   c            !          e Zd Z	 ddej        dej        deeee	e         f                  fdZ
	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 d	ee         d
eej                 deeee	e         f                  dee         dee         dee         dee         deej                 deej                 dee         deeef         fdZdS )Mistral3ModelNpixel_valuesr0   r`   c                   	
 ||n| j         j        }d |                                D             } | j        |f|dd|
t	          |t
                    r
j        |         }n$
fd|D             }t          j        |d          }| 	                    |
                    d          |          }| j        j        | j         j        z  		fd	|D             }t          j        |
                    d          |          }|S )
aU  
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`):
               The tensors corresponding to the input images.
            vision_feature_layer (`Union[int, list[int]]`, *optional*):
                The index of the layer to select the vision feature. If multiple indices are provided,
                the vision feature of the corresponding indices will be concatenated to form the
                vision features.
            image_sizes (`torch.Tensor`, *optional*):
                Tensor containing the image sizes as returned by the processor.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        Nc                     i | ]
\  }}|||S rj   r   )r4   kvs      r   
<dictcomp>z4Mistral3Model.get_image_features.<locals>.<dictcomp>   s    CCC41aQ]!Q]]]r   T)r0   output_hidden_statesc                 *    g | ]}j         |         S r   )rk   )r4   	layer_idximage_outputss     r   r6   z4Mistral3Model.get_image_features.<locals>.<listcomp>   s!    ddd)}29=dddr   r:   r=   r   c                 ,    g | ]\  }}|z  |z  z  S r   r   )r4   heightwidthdownsample_ratios      r   r6   z4Mistral3Model.get_image_features.<locals>.<listcomp>   s1    sssVcV\^c"22u@P7PQsssr   )r!   r`   itemsvision_towerr_   ra   rk   rE   rJ   multi_modal_projectorsqueezer*   r)   rA   )r-   rt   r0   r`   kwargsselected_image_featurehs_poolr/   split_sizesr   r}   s            @@r   get_image_featuresz Mistral3Model.get_image_featuresy   s1   . %9$D  $+Jj 	 DC6<<>>CCC)),uKfjuuntuu *C00 	@%2%@AU%V""ddddOcdddG%*YwB%?%?%?"334J4R4RST4U4UWbcc,7$+:XXssssgrsss^%;%;A%>%>LLr   	input_idsattention_maskposition_idspast_key_valuesinputs_embeds	use_cacheoutput_attentionsrz   return_dictcache_positionr   r1   c                 n   |	|	n| j         j        }	|
|
n| j         j        }
||n| j         j        }||n| j         j        }|d u |d uz  rt          d          | |                                 |          }|z|                     |||          }t          j	        |d          
                    |j        |j                  }|                     |||          }|                    ||          } | j        d	||||||	|
d|d	|}t!          |j        |j        |j        |j        ||nd           S )
Nz:You must specify exactly one of input_ids or inputs_embeds)rt   r`   r0   r   r=   )r   r/   T)	r   r   r   r   r   r   rz   r   r   )last_hidden_stater   rk   
attentionsimage_hidden_statesr   )r!   r   rz   use_return_dictr`   
ValueErrorget_input_embeddingsr   rE   rJ   todevicedtypeget_placeholder_maskmasked_scatterlanguage_modelro   r   r   rk   r   )r-   r   rt   r   r   r   r   r`   r   r   rz   r   r   r0   r   r/   special_image_maskoutputss                     r   rR   zMistral3Model.forward   s   " 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]$8$D  $+Jj 	 -t";< 	[YZZZ 7D5577	BBM#!44)%9' 5  N
 #Y~1===@@AUWdWjkkN!%!:!:~ "; " " *889K^\\M%$% 
)%+'/!5)
 
 
 
 +%7#3!/)2>2JPT
 
 
 	
r   rj   )NNNNNNNNNNNNN)r   r   r   rE   FloatTensorrT   r   r   ra   listr   
LongTensorr   boolr	   r   tuplero   rR   r   r   r   rs   rs   x   s       
 AE	) )') \) 'uS$s)^'<=	) ) ) )Z 15481537+/59@D$(,0/3&*59.2?
 ?
E,-?
 u01?
 !.	?

 u/0?
 "%?
   12?
 'uS$s)^'<=?
 D>?
 $D>?
 'tn?
 d^?
 !!12?
 el+?
 +,?
  
u11	2!?
 ?
 ?
 ?
 ?
 ?
r   rs   c            #          e Zd Z	 ddej        dej        deeee	e         f                  fdZ
	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 d	eej                 d
ee         deej                 deej                 dee         dee         dee         dee         deej                 deeej        f         deej                 dee         deeef         f dZdS ) Mistral3ForConditionalGenerationNrt   r0   r`   c                 .     | j         j        d|||d|S )N)rt   r0   r`   r   )modelr   )r-   rt   r0   r`   r   s        r   r   z3Mistral3ForConditionalGeneration.get_image_features   s:     -tz, 
%#!5
 
 	
 
 	
r   r   r   r   r   r   r   labelsr   r   rz   r   r   logits_to_keepr   r1   c                    |	|	n| j         j        }	|
|
n| j         j        }
||n| j         j        } | j        d||||||||	|
d||d|}|d         }t          |t                    rt          | d          n|}|                     |dd|ddf                   }d}|  | j	        d||| j         j
        j        d|}t          |||j        |j        |j        |j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Mistral3ForConditionalGeneration

        >>> model = Mistral3ForConditionalGeneration.from_pretrained("mistralai/Mistral-Small-3.1-24B-Instruct-2503")
        >>> processor = AutoProcessor.from_pretrained("mistralai/Mistral-Small-3.1-24B-Instruct-2503")

        >>> prompt = "<s>[INST][IMG]What is the image?[/INST]"
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is the image?The image depicts two cats lying on a pink blanket."
        ```NT)r   rt   r   r   r   r   r   r   rz   r   r   r0   r   )logitsr   
vocab_size)lossr   r   rk   r   r   r   )r!   r   rz   r   r   r_   ra   slicelm_headloss_functionr[   r   rm   r   rk   r   r   )r-   r   rt   r   r   r   r   r   r   r   rz   r   r   r   r0   r   r   rk   slice_indicesr   r   s                        r   rR   z(Mistral3ForConditionalGeneration.forward   sd   Z 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]$* 
%)%+'/!5)#
 
 
 
   
8B>SV8W8Wk~ot444]kmAAA}aaa,?@AA%4% f9P9[ _e D .#3!/) ' ;
 
 
 	
r   rj   )NNNNNNNNNNNNr   N)r   r   r   rE   r   rT   r   r   ra   r   r   r   r   r   r	   r   r   rm   rR   r   r   r   r   r      s       
 AE	
 
'
 \
 'uS$s)^'<=	
 
 
 
  15481537+/59-1$(,0/3&*5934.2U
 U
E,-U
 u01U
 !.	U

 u/0U
 "%U
   12U
 )*U
 D>U
 $D>U
 'tnU
 d^U
 !!12U
 c5</0U
 el+U
  +,!U
" 
u44	5#U
 U
 U
 U
 U
 U
r   r   )rs   rq   r   )%typingr   r   rE   r   activationsr   cache_utilsr   processing_utilsr	   utilsr
   llava.modeling_llavar   r   r   r   r   r   mistral.modeling_mistralr   configuration_mistral3r   
get_loggerr   loggerr   Moduler    rX   rm   ro   rq   rs   r   __all__r   r   r   <module>r      s    # " " " " " " "        ! ! ! ! ! !             & & & & & &                      6 5 5 5 5 5 2 2 2 2 2 2 
	H	%	%	 	 	 	 	n 	 	 	# # # # #") # # #L    ")   4	 	 	 	 	%@ 	 	 		 	 	 	 	": 	 	 		 	 	 	 	2 	 	 	k
 k
 k
 k
 k
J k
 k
 k
\d
 d
 d
 d
 d
'D d
 d
 d
N  r   