
    .`i              
       6   d dl Z d dlmZmZmZ d dlmZmZ d dlm	Z	 d dl
mZmZ d dlZd dlmZ d dlmc mZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZm Z m!Z! d dl"m#Z# d dl$m%Z& d dl'm(Z(m)Z)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2m3Z3 d dl4m5Z5 d dl6m7Z7 d dl8m9Z9 d dl:m;Z;m<Z<m=Z= d dl>m?Z? d dl@mAZA d dlBmCZCmDZD d dlEmFZFmGZGmHZHmIZI d dlJmKZKmLZLmMZM d dlNmOZOmPZP d dlQmRZRmSZSmTZTmUZUmVZVmWZW d dlXmYZY d dlZm[Z[ d d l\m]Z] d d!l^m_Z_ d d"l`maZambZb d#d$lcmdZdmeZemfZfmgZg d#d%lhmiZi d#d&ljmkZkmlZl d#d'lmmnZnmoZompZpmqZq 	 d d(lrmsZt  eYju                    r eYjv        d)          rd*Zwnd+Zwn# ex$ r d*ZwY nw xY wd,Zy G d- d.ea          Zz G d/ d0          Z{ G d1 d2eS          Z| G d3 d4eOe|                   Z} G d5 d6eRe|                   Z~ eCj        e~e|e}7           G d8 d9ej        eeefeg                      Ze G d: d;                      Zd<ej        d=ej        d>ej        fd?Zd@edAedBedCed>ej        f
dDZdEej        dFej        d<ej        d>eej        ej        f         fdGZ G dH dIej                  Z G dJ dKej                  Z G dL dMej                  Z G dN dOej                  ZdPeej                 d>ej        fdQZ G dR dSej                  Z G dT dUej                  Z G dV dWej                  Zd=ej        dXeeeef                  dYed>eej                 fdZZ G d[ d\ene                    Z G d] d^ej                  Z G d_ d`ej                  Z G da dbej                  Z G dc ddej                  Z G de dfej                  ZdS )g    N)IterableMappingSequence)	dataclassfields)cached_property)	AnnotatedLiteral)
ImageChunk	TextChunk)UserMessage)ChatCompletionRequest)ImageEncoder)Image)BatchFeaturePixtralVisionConfig
TensorType)
ImageInput)_num_image_tokens)PixtralRotaryEmbeddingapply_rotary_pos_embposition_ids_in_meshgrid)	TextInput)
VllmConfig)BaseDummyOptions)divide$get_tensor_model_parallel_world_size)get_act_and_mul_fn)Conv2dLayer)RMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)QuantizationConfig)default_weight_loader)MULTIMODAL_REGISTRYMultiModalKwargsItems)MultiModalDataDictMultiModalFieldConfigMultiModalUUIDDictNestedTensors)ImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderProcessorInputs)BaseMultiModalProcessorBaseProcessingInfoMultiModalProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)current_platform)IntermediateTensors)cached_tokenizer_from_config)MistralTokenizer)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPP)MultiModelKeys)init_vllm_registered_modelmaybe_prefix)VisionEncoderInfoVisionFeatureSelectStrategyis_vit_use_data_parallelresolve_visual_encoder_outputs)opsd   FTpatch_mergec            
           e Zd ZU dZdZed         ed<   eej	        e
ej	                 z   eddddddh          f         ed	<   d
S )PixtralImagePixelInputsz
    Dimensions:
        - bn: Batch size * number of images
        - c: Number of channels (3)
        - h: Height of each image
        - w: Width of each image

    The result of stacking `ImageEncoding.tokens` from each prompt.
    pixel_valuestypebn   hw)dynamic_dimsimagesN)__name__
__module____qualname____doc__rO   r
   __annotations__r	   torchTensorlistr<        v/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/pixtral.pyrM   rM   `   s           %3D'.
!222tEL))D!S#S#J???	A     r_   rM   c                   @    e Zd ZdZdeddf fdZedefd            Ze	de
fd            Ze	de
fd            Ze	de
fd	            Ze	de
fd
            Ze	de
fd            Z	 	 	 ddeee         z  dz  deee         z  dz  deez  dz  deeef         fdZ xZS )PixtralProcessorAdapterzo
    Provide a HF-compatible interface for
    `mistral_common.tokens.tokenizers.multimodal.ImageEncoder`.
    	tokenizerreturnNc                 V    t                                                       || _        d S N)super__init__rc   )selfrc   	__class__s     r`   rh   z PixtralProcessorAdapter.__init__y   s$    "r_   c                 V    | j         j        j        }t          |t                    sJ |S rf   )rc   instruct
mm_encoder
isinstancer   )ri   image_encoders     r`   image_processorz'PixtralProcessorAdapter.image_processor~   s+    /:-66666r_   c                 $    | j         j        j        S rf   )rp   special_ids	img_breakri   s    r`   image_break_idz&PixtralProcessorAdapter.image_break_id   s    #/99r_   c                 $    | j         j        j        S rf   )rp   rr   imgrt   s    r`   image_token_idz&PixtralProcessorAdapter.image_token_id   s    #/33r_   c                 $    | j         j        j        S rf   )rp   rr   img_endrt   s    r`   image_end_idz$PixtralProcessorAdapter.image_end_id   s    #/77r_   c                 $    | j         j        j        S rf   )rp   	mm_configmax_image_sizert   s    r`   
image_sizez"PixtralProcessorAdapter.image_size   s    #-<<r_   c                 $    | j         j        j        S rf   )rp   r}   image_patch_sizert   s    r`   
patch_sizez"PixtralProcessorAdapter.patch_size   s    #->>r_   textrU   return_tensorsc                 <   |g }t          |t                    s|g}|g }t          |t                    s|g}|s0|                     |          j        }dt	          j        |          iS t          d |D                       rt          d          t          t          j                             }t          t          j                             }|D ]}| 	                    t          |                    }	t	          j        |	j                  }
t	          j        |	j                  }|                    |
           |                    |           t          t	          j        |          d                              t#          |          d          |d          S )N	input_idsc              3   <   K   | ]}t          |          d k    V  dS )r   N)len).0ts     r`   	<genexpr>z3PixtralProcessorAdapter.__call__.<locals>.<genexpr>   s,      ((as1vvz((((((r_   zYou've passed text inputs instead of token inputs. Make sure to process your input via `mistral_common`'s tokenizer or pass a chat completion request. For more info, see: https://github.com/vllm-project/vllm/issues/8411.image)r   rU   )rn   r]   rc   r   r[   tensorany
ValueErrorr\   rp   r   r   tokensappendr   catexpandr   )ri   r   rU   r   kwargsr   images_processedimages_tokensr   image_inputsimage_processedimage_tokenss               r`   __call__z PixtralProcessorAdapter.__call__   s    <D$%% 	6D>F&$'' 	XF 	:t,,6Ii!8!899 ((4((((( 	D    -//U\*,, 	/ 	/E//
0G0G0GHHL#l<+=>>O <(;<<L##O444  ...."Y}55d;BB3t99bQQ* 
 
 	
r_   )NNN)rV   rW   rX   rY   r:   rh   propertyr   rp   r   intru   rx   r{   r   r   r   r]   r   strr   r   r+   r   __classcell__rj   s   @r`   rb   rb   s   s        
#"2 #t # # # # # #
     X
 : : : : _: 4 4 4 4 _4 8c 8 8 8 _8 =C = = = _= ?C ? ? ? _?
 487;26	/
 /
$y/)D0/
 T*--4/
 j(4/	/
 
m#	$/
 /
 /
 /
 /
 /
 /
 /
r_   rb   c            	           e Zd ZdefdZdefdZdeee	dz  f         fdZ
	 ddedz  fdZddd	e	d
e	dedz  de	fdZdefdZdS )PixtralProcessingInford   c                     t          | j        j                  }t          |t                    st          d          |S )Nz.This model requires `--tokenizer-mode mistral`)r9   ctxmodel_configrn   r:   r   )ri   rc   s     r`   get_tokenizerz#PixtralProcessingInfo.get_tokenizer   s=    01FGG	)%566 	OMNNNr_   c                 D    t          |                                           S rf   )rb   r   rt   s    r`   get_hf_processorz&PixtralProcessingInfo.get_hf_processor   s    &t'9'9';';<<<r_   Nc                 
    dd iS )Nr   r^   rt   s    r`   get_supported_mm_limitsz-PixtralProcessingInfo.get_supported_mm_limits   s    r_   	processorc                 d    ||                                  }t          |j        |j                  S )N)r   r   )r   r   r   r   )ri   r   s     r`   get_vision_configz'PixtralProcessingInfo.get_vision_config   s>     --//I" + +
 
 
 	
r_   )r   image_widthimage_heightc                    ||                                  }|j                            t          j        d||f                    \  }}||z  S )NRGB)r   rp   _image_to_num_tokensr   new)ri   r   r   r   ncolsnrowss         r`   get_num_image_tokensz*PixtralProcessingInfo.get_num_image_tokens   sW     --//I 0EEIek<899
 
u u}r_   c                 n    |                                  j        }|j        j        }t	          ||          S )N)widthheight)r   rp   r}   r~   r-   )ri   rp   r~   s      r`   !get_image_size_with_most_featuresz7PixtralProcessingInfo.get_image_size_with_most_features   s4    //11A(2A~nEEEEr_   rf   )rV   rW   rX   r:   r   rb   r   r   r   r   r   r   r   r-   r   r^   r_   r`   r   r      s       /    ="9 = = = =cDj)A    
 59

 

*T1

 

 

 

" 59    	
 +T1 
    F9 F F F F F Fr_   r   c            	           e Zd Zdeeef         defdZ	 d	dedeeef         deeef         dz  defdZ		 d	dedeeef         deeef         dz  de
fdZdS )
PixtralDummyInputsBuilder	mm_countsrd   c                     dS )N r^   )ri   r   s     r`   get_dummy_textz(PixtralDummyInputsBuilder.get_dummy_text   s    rr_   Nseq_len
mm_optionsc                     |                     dd          }| j                                        \  }}|r|                     d          nd }d|                     ||||          iS )Nr   r   )r   r   
num_images	overrides)getinfor   _get_dummy_images)ri   r   r   r   r   target_widthtarget_heightimage_overridess           r`   get_dummy_mm_dataz+PixtralDummyInputsBuilder.get_dummy_mm_data   s|     ]]7A..
&*i&Q&Q&S&S#m5?I*..111T T++"$%)	 ,  
 	
r_   c                    | j                                         }|                     |          }|                     |||          }|                    dg           }ddi}t          t          t          |          gd |D                       g          }	|j        	                    |	          }
|
j
        }t          |||          S )	Nr   
truncationF)r   c              3   6   K   | ]}t          |           V  dS )r   N)r   )r   r   s     r`   r   zGPixtralDummyInputsBuilder.get_dummy_processor_inputs.<locals>.<genexpr>&  s-      LLe*5111LLLLLLr_   )content)messages)promptmm_datatokenization_kwargs)r   r   r   r   r   r   r   r   mistralencode_chat_completionr   r0   )ri   r   r   r   rc   
dummy_textdummy_mm_datadummy_imagesr   requestresdummy_tokenss               r`   get_dummy_processor_inputsz4PixtralDummyInputsBuilder.get_dummy_processor_inputs  s     I++--	((33
..w	:NN$(("55+U3'!z222LL|LLL  	
 	
 	
 66w??z! 3
 
 
 	
r_   rf   )rV   rW   rX   r   r   r   r   r   r(   r   r0   r   r^   r_   r`   r   r      s        S(9 c     =A	
 

 38$
 C!112T9	

 

 
 
 
2 =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
r_   r   c                   
    e Zd Zdeeef         deeef         deeef         fdZde	deeef         de
dee         fdZ	 dd	eee         z  d
e	deeef         deeef         dedz  deee         eef         f fdZ xZS )PixtralMultiModalProcessor	hf_inputshf_processor_mm_kwargsrd   c                 F    t          t          j        d                    S )Nr   )rU   )dictr)   batched)ri   r   r   s      r`   _get_mm_fields_configz0PixtralMultiModalProcessor._get_mm_fields_config6  s!    
 08AABBBBr_   mm_itemsout_mm_kwargsc                      | j         j        di |j        j        j        dt
          ffd}t          dd|          gS )Nitem_idxc                 *   	                     dt                    }|                    |           }
j                            t          j        d|j        |j        f                    \  }}g|z  gz   |z  }|d<   t          j
        |          S )Nr   r   r   )	get_itemsr,   get_image_sizerp   r   r   r   r   r   r6   select_token_id)r   rU   r   r   r   r   ru   r{   rx   r   r   s         r`   get_replacementzGPixtralMultiModalProcessor._get_prompt_updates.<locals>.get_replacementI  s    ''1DEEF..x88J$4II	%*"2J4E!FGG LE5 &&..1AAUJF%F2J&6v~NNNr_   r   r   )modalitytargetreplacementr^   )r   r   ru   rx   r{   r   r4   )	ri   r   r   r   r   ru   r{   rx   r   s	    `   @@@@r`   _get_prompt_updatesz.PixtralMultiModalProcessor._get_prompt_updates=  s     /DI.HH1GHH	"1"1 -	Oc 	O 	O 	O 	O 	O 	O 	O 	O 	O 	O  +  
 	
r_   Nr   mm_data_itemsr   mm_uuidsc                 b    t                                          |||||          \  }}}||dfS )N)r   r   r   r   r   T)rg   _cached_apply_hf_processor)
ri   r   r   r   r   r   
prompt_idsmm_info_rj   s
            r`   r   z5PixtralMultiModalProcessor._cached_apply_hf_processor^  sI     "'!C!C'#9 3 "D "
 "

GQ 7D((r_   rf   )rV   rW   rX   r   r   r+   objectr)   r   r.   r'   r   r5   r   r]   r   r*   tupler3   boolr   r   r   s   @r`   r   r   5  sO       C3-.C !(V 4C 
++	,	C C C C
%
 !(V 4
 -	

 
,	
 
 
 
N /3) )d3i) +) !(V 4	)
 %S&[1) %t+) 
tCy2D8	9) ) ) ) ) ) ) ) ) )r_   r   )r   dummy_inputsc                       e Zd Zededededz  fd            Zddded	ef fd
Zde	de
dz  fdZde
deej        df         fdZde	defdZ	 	 ddej        dej        dedz  dej        dz  de	dej        ez  fdZdej        dej        dz  fdZdeeeej        f                  fdZdefdZdedefdZdedefdZ xZS ) PixtralForConditionalGenerationr   ird   Nc                 N    |                     d          rd S t          d          )Nr   z Only image modality is supported)
startswithr   )clsr   r  s      r`   get_placeholder_strz3PixtralForConditionalGeneration.get_placeholder_strz  s,    w'' 	4;<<<r_   r   prefixvllm_configr  c          	         t                                                       |j        j        }|j        j        }|| _        || _        d t          t                    D             fd| j        j        	                                
                                D             }t          di || _        |                     |          5  t          ||j        t          |d                    | _        d d d            n# 1 swxY w Y   |                     |d          5  t%          | j                  | _        | j        j        rt+          | j        j        d          nd | _        | j        j        t2          k    r&t5          | j        j        | j        j        d	          nd | _        t;          | j        |j        j        
          | _        d d d            n# 1 swxY w Y   | j        j        | _        d S )Nc                     h | ]	}|j         
S r^   )name)r   fields     r`   	<setcomp>z;PixtralForConditionalGeneration.__init__.<locals>.<setcomp>  s    NNN5EJNNNr_   c                 $    i | ]\  }}|v 	||S r^   r^   )r   keyvaluedataclass_fieldss      r`   
<dictcomp>z<PixtralForConditionalGeneration.__init__.<locals>.<dictcomp>  s5     
 
 
U&&& &&&r_   language_model)r  	hf_configr  r   h㈵>epsF)vision_encoder_dimspatial_merge_sizeuse_mlp_biasdimr^   ) rg   rh   r   r  multimodal_configconfigr   VisionEncoderArgsvision_configto_dictitemsvision_args_mark_language_modelrC   text_configrD   r  _mark_tower_modelVisionTransformervision_encoderadd_pre_mm_projector_layer_normr    hidden_sizepre_mm_projector_normmm_projector_idPATCH_MERGEPatchMergerr  patch_mergerVisionLanguageAdaptervision_language_adaptermake_empty_intermediate_tensors)ri   r  r  r  r  r#  r  rj   s         @r`   rh   z(PixtralForConditionalGeneration.__init__  s   )3'4F!2NNF;L4M4MNNN
 
 
 
"k7??AAGGII
 
 
 -;;{;; &&{33 	 	"<' ,#F,<==# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ##K99 	 	"3D4D"E"ED #C(4$???? & #3{BB '+'7'C'+'7'J!&      ,A f&8&D, , ,D(!	 	 	 	 	 	 	 	 	 	 	 	 	 	 	* ? 	,,,s%   	+D  DD!B/GG #G r   c                 X    |                     dd           }|d S t          d|          S )NrU   rN   )rO   rU   )poprM   )ri   r   rU   s      r`   _parse_and_validate_image_inputz?PixtralForConditionalGeneration._parse_and_validate_image_input  s>     Hd++>4&
 
 
 	
r_   image_input.c                    |d         }|                      |          }d |D             }t          j        |          }| j        |                     |          }| j        N| j        j        | j        j        dz  fd|D             }fd|D             }|                     ||          }|                     |          }t          j	        ||          }|S )NrU   c                 (    g | ]}|j         d          S r   shape)r   image_features     r`   
<listcomp>zHPixtralForConditionalGeneration._process_image_input.<locals>.<listcomp>  s     TTTM,Q/TTTr_      c                 P    g | ]"}|j         d          z  |j         d         z  f#S )r=   r>  r:  )r   rw   r   s     r`   r=  zHPixtralForConditionalGeneration._process_image_input.<locals>.<listcomp>  sD        1+SYq\Z-GH  r_   c                     g | ]}|z  S r^   r^   )r   feature_sizespatial_merge_size_squares     r`   r=  zHPixtralForConditionalGeneration._process_image_input.<locals>.<listcomp>  s/          99  r_   )image_sizes)
r(  r[   r   r+  r/  r#  r   r  r1  split)	ri   r6  rU   image_featuresfeature_sizesimg_patch_dimsimage_embedsr   rB  s	          @@r`   _process_image_inputz4PixtralForConditionalGeneration._process_image_input  s#    X&,,V44TT^TTT>22%1!77GGN()4J(,(8(KQ(N%   !  N   $1  M "..N /  N 33NCC{<??r_   c                 N     | j         di |}|g S |                     |          S )Nr^   )r5  rI  )ri   r   r6  s      r`   embed_multimodalz0PixtralForConditionalGeneration.embed_multimodal  s9    :d:DDVDDI((555r_   r   	positionsintermediate_tensorsinputs_embedsc                 J    |d}| j                             ||||          }|S )zRun forward pass for pixtral.N)rN  )r  model)ri   r   rL  rM  rN  r   hidden_statess          r`   forwardz'PixtralForConditionalGeneration.forward  s>      + M+11y"6m 2 
 
 r_   rQ  c                 6    | j                             |          S rf   )r  compute_logits)ri   rQ  s     r`   rT  z.PixtralForConditionalGeneration.compute_logits  s     "11-@@@r_   weightsc           
         	
 dt           t          t          j        f         fddt           t          t          j        f         fddt           t          t          j        f         fddt           t          t          j        f         fd j        &t           j                                                  ni 	 j        &t           j                                                  ni  j        &t           j                                                  ni  j	        &t           j	                                                  ni 
 	
f
d} j
                             |                       d S )Nweightc                 8    | d                              d          S )Nr   )r(  vision_towerr  rW  s    r`   is_vision_encoder_weightszOPixtralForConditionalGeneration.load_weights.<locals>.is_vision_encoder_weights  s    !9''(JKKKr_   c                 8    | d                              d          S )Nr   )r1  multi_modal_projectorrZ  r[  s    r`   is_vision_lang_adapter_weightszTPixtralForConditionalGeneration.load_weights.<locals>.is_vision_lang_adapter_weights  s!    !9''D  r_   c                 8    | d                              d          S )Nr   r/  rZ  r[  s    r`   is_patch_mergerzEPixtralForConditionalGeneration.load_weights.<locals>.is_patch_merger  s    !9''777r_   c                 8    | d                              d          S )Nr   r+  rZ  r[  s    r`   is_pre_mm_projector_normzNPixtralForConditionalGeneration.load_weights.<locals>.is_pre_mm_projector_norm  s    !9''(?@@@r_   c               3     
K   D ]h\  } } | |f          r
j         d                    |                     d          dd                    }                    |          }|;t	          j                    5  t          ||           d d d            n# 1 swxY w Y    | |f          r}
j        d                    |                     d          dd                    }|         }t	          j                    5  t          ||           d d d            n# 1 swxY w Y   ( | |f          r~
j        >d                    |                     d          dd                    }	|         }t	          j                    5  t          ||           d d d            n# 1 swxY w Y    | |f          r
j	        d                    |                     d          dd                    }                    |          }|;t	          j                    5  t          ||           d d d            n# 1 swxY w Y   M| 
                    d          } | |fV  jd S )N.r=   zlanguage_model.)r(  joinrD  r   r[   no_gradr%   r/  r+  r1  removeprefix)r  rS   trimmed_nameparamra  rc  r\  r_  patch_merger_dictpre_mm_projector_norm_dictri   vision_encoder_dictvision_lang_adapter_dictrU  s       r`   llm_weights_generatorzKPixtralForConditionalGeneration.load_weights.<locals>.llm_weights_generator  ss     " ($ ($a,,dAY77 '$*2 #&88DJJsOOABB,?#@#@L/33LAAE("]__ < <1%;;;< < < < < < < < < < < < < < <$_dAY// $(0 #&88DJJsOOABB,?#@#@L-l;E 8 8-eQ7778 8 8 8 8 8 8 8 8 8 8 8 8 8 8--tQi88 $19 #&88DJJsOOABB,?#@#@L6|DE 8 8-eQ7778 8 8 8 8 8 8 8 8 8 8 8 8 8 833T1I>> $3; #&88DJJsOOABB,?#@#@L488FFE("]__ < <1%;;;< < < < < < < < < < < < < < <  ,,->??D)OOOOQ($ ($sH   ;BB	B	D!!D%	(D%	F,,F0	3F0	)II
	I
	)r   r   r[   r\   r(  r   named_parametersr/  r+  r1  r  load_weights)ri   rU  ro  ra  rc  r\  r_  rk  rl  rm  rn  s   `` @@@@@@@@r`   rq  z,PixtralForConditionalGeneration.load_weights  s   	LeC4E.F 	L 	L 	L 	L	5el9J3K 	 	 	 	
	8E#u|*;$< 	8 	8 	8 	8	AU33D-E 	A 	A 	A 	A ". $5577888 	  , "3355666 	 )5 +<<>>??? 	# +7 ->>@@AAA 	!*	$ *	$ *	$ *	$ *	$ *	$ *	$ *	$ *	$ *	$ *	$ *	$ *	$ *	$Z 	(()>)>)@)@AAAAAr_   c                 0    t          j        ddd          S )Nr  r1  r(  )r  	connectortower_model)rB   from_string_fieldrt   s    r`   get_mm_mappingz.PixtralForConditionalGeneration.get_mm_mappingL  s%    /+/(
 
 
 	
r_   num_image_tokensc                 P    t          | dd           |S | j        j        }||dz  z  S Nr/  r>  getattrr#  r  )ri   rw  
merge_sizes      r`   get_num_mm_encoder_tokensz9PixtralForConditionalGeneration.get_num_mm_encoder_tokensS  s4    4..6##%8
:q=11r_   num_vision_tokensc                 P    t          | dd           |S | j        j        }||dz  z  S ry  rz  )ri   r~  r|  s      r`   get_num_mm_connector_tokensz;PixtralForConditionalGeneration.get_num_mm_connector_tokensY  s4    4..6$$%8
 Z]33r_   )NN)rV   rW   rX   classmethodr   r   r  r   rh   r   rM   r5  r   r[   r\   rI  r>   rK  r8   rR  rT  r   rq  rB   rv  r}  r  r   r   s   @r`   r   r   r  sS        =3 =3 =3: = = = [= BD .
 .
 .
z .
3 .
 .
 .
 .
 .
 .
`



	 4	'

 

 

 

, 
u|S 	!   86 64H 6 6 6 6 <@-1 < < 2D8	
 |d*  
+	+   $A|A 
	A A A ARBHU33D-E$F RB RB RB RBh
 
 
 
 
2# 2# 2 2 2 24S 4S 4 4 4 4 4 4 4 4r_   r   c                       e Zd ZU eed<   eed<   eed<   eed<   eed<   eed<   eed<   eed<   eed	<   d
Zeed<   dZeed<   dZ	eed<   dZ
eed<   dS )r  r*  num_channelsr   r   intermediate_sizenum_hidden_layersnum_attention_heads
rope_thetarx   Tadapter_biasr=   r  Fr)  r   r,  N)rV   rW   rX   r   rZ   floatr  r   r  r)  r,  r   r^   r_   r`   r  r  a  s         OOOOOOL$,1#T111OSr_   r  	freqs_cisxrd   c                    |j         dk    sJ | j        |j        d         |j        d         fk    s)J | j        |j        d         |j        d         ff            fdt          |j                  D             } | j        | S )zd
    freqs_cis: complex - (seq_len, head_dim / 2)
    x: complex - (bsz, seq_len, head_dim / 2)
    r=   r   c                 <    g | ]\  }}|d k    s	|d z
  k    r|nd S r=   r^   )r   r  dndims      r`   r=  z*_reshape_for_broadcast.<locals>.<listcomp>}  s5    PPPTQ!q&&AMMQQqPPPr_   )r  r;  	enumerateview)r  r  r;  r  s      @r`   _reshape_for_broadcastr  r  s    
 6D!8888?qwqz172;7777	
QWR[!:777 QPPPYqw=O=OPPPE9>5!!r_   r  r   r   thetac                    d|t          j        d| d                                          | z  z  z  }t          j        ||j                  }t          j        ||j                  }t          j        ||ddd                                                   }t          j        ||ddd                                                   }t          j        |dddddf                             d|d          |dddddf                             |dd          gd          }	t          j        t          j        |	          |	          S )	z
    freqs_cis: 2D complex tensor of shape (height, width, dim // 2)
        to be indexed by (height, width) position tuples
    g      ?r   r>  deviceNr=   r   r  )	r[   aranger  r  outerr   repeatpolar	ones_like)
r  r   r   r  freqsrR   rS   freqs_hfreqs_wfreqs_2ds
             r`   precompute_freqs_cis_2dr    s:    5U\!S!44::<<sBCDEVEL111AU5<000Ak!U33Q3Z((..00Gk!U14a4[))//11GyAAAtQQQJ&&q%33D!!!QQQJ&&vq!44	
   H ;ux00(;;;r_   xqxkc                 T   t          j         |                                 j        g | j        d d         ddR            }t          j         |                                j        g |j        d d         ddR            }|j        t           j        k    sJ t          ||          }t          j        ||z            	                    d          }t          j        ||z            	                    d          }|
                    |           |
                    |          fS )Nr   r>  rQ   )r[   view_as_complexr  reshaper;  dtype	complex64r  view_as_realflattentype_as)r  r  r  xq_xk_xq_outxk_outs          r`   apply_rotary_emb_vitr    s   
 
 2

 2 IBHSbSM I2 Iq I I I
J
JC

 2

 2 IBHSbSM I2 Iq I I I
J
JC?eo----&y#66Ii0088;;Fi0088;;F>>"v~~b1111r_   c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )FeedForwardargsc                 >   t                                                       |j        J t          j        |j        |j        d          | _        t          j        |j        |j        d          | _        t          j        |j        |j        d          | _        d S NFbias)	rg   rh   r  nnLinearr*  w1w2w3ri   r  rj   s     r`   rh   zFeedForward.__init__  s    %111)D,d.D5QQQ)D2D4D5QQQ)D,d.D5QQQr_   r  rd   c                     |                      t          j        |                     |                    |                     |          z            S rf   )r  Fsilur  r  ri   r  s     r`   rR  zFeedForward.forward  s7    wwqvdggajj))DGGAJJ6777r_   	rV   rW   rX   r  rh   r[   r\   rR  r   r   s   @r`   r  r    sr        R. R R R R R R8 8%, 8 8 8 8 8 8 8 8r_   r  c                   d     e Zd Zdef fdZdej        dej        dej        dej        fdZ xZS )	Attentionr  c                    t                                                       || _        |j        |j        z  rJ |j        | _        |j        |j        z  | _        t          j        |j        |j        d          | _	        t          j        |j        |j        d          | _
        t          j        |j        |j        d          | _        t          j        |j        |j        d          | _        d S r  )rg   rh   r  r*  r  n_headshead_dimr  r  wqwkwvwor  s     r`   rh   zAttention.__init__  s    	#d&>>>>>/(D,DD)D,d.>UKKK)D,d.>UKKK)D,d.>UKKK)D,d.>UKKKr_   r  maskr  rd   c                 4   |j         \  }}}|                     |          |                     |          |                     |          }	}}|                    ||| j        | j                  }|                    ||| j        | j                  }|	                    ||| j        | j                  }	t          |||          \  }}t          rt          j
        |||	|          }
n{|                    dd          }|                    dd          }|	                    dd          }	t          j                            |||	|          }
|
                    dd          }
|
                    ||| j        | j        z            }
|                     |
          S )N)r  	attn_biasr=   r>  	attn_mask)r;  r  r  r  r  r  r  r  USE_XFORMERS_OPSxopsmemory_efficient_attention	transposer  
functionalscaled_dot_product_attentionr  )ri   r  r  r  batchpatchesr   qkvouts              r`   rR  zAttention.forward  sb    Gw''!**dggajj$''!**a1IIeWdlDMBBIIeWdlDMBBIIeWdlDMBB#AqI>>>1 	&1!QTJJJCCAq!!AAq!!AAq!!A-<<Q1PT<UUC--1%%Ckk%$,*FGGwws||r_   r  r   s   @r`   r  r    s        
L. 
L 
L 
L 
L 
L 
L< l <	
 
       r_   r  c                   d     e Zd Zdef fdZdej        dej        dej        dej        fdZ xZS )TransformerBlockr  c                    t                                                       t          |          | _        t	          |          | _        t          |j        d          | _        t          |j        d          | _	        d S )Nr  r  )
rg   rh   r  	attentionr  feed_forwardr    r*  attention_normffn_normr  s     r`   rh   zTransformerBlock.__init__  si    "4'--%d&6DAAA 0d;;;r_   r  r  r  rd   c                     | j                             |                     |          ||          }||z   }| j                            |                     |                    }||z   }|S Nr  r  r  rR  r  r  r  )ri   r  r  r  rrR   r  s          r`   rR  zTransformerBlock.forward  sp     N"""" # 
 
 E%%dmmA&6&677!e
r_   r  r   s   @r`   r  r    s        <. < < < < < << l <	
 
       r_   r  c                   j     e Zd Zdef fdZdej        dej        dej        dz  dej        fdZ xZS )	Transformerr  c                 
   t                                                       t          j                                        | _        t          |j                  D ])}| j                            t          |                     *d S rf   )
rg   rh   r[   r  
ModuleListlayersranger  r   r  )ri   r  r   rj   s      r`   rh   zTransformer.__init__  st    h))++t-.. 	7 	7AK/556666	7 	7r_   r  r  r  Nrd   c                 6    | j         D ]} ||||          }|S r  )r  )ri   r  r  r  layers        r`   rR  zTransformer.forward  s2     [ 	9 	9Eadi888AAr_   r  r   s   @r`   r  r    s        7. 7 7 7 7 7 7< l <$&	
 
       r_   r  patch_embeds_listc                 B    t          j        d | D                       }|S )Nc                     g | ]{}t          j        t          j        t          j        |j        d                    t          j        |j        d                   d          d                              dd          |S )r   ij)indexingr  r>  )r[   stackmeshgridr  r;  r  r   ps     r`   r=  z%position_meshgrid.<locals>.<listcomp>  s     
	
 
	
 
	
  KL--L--!  
    gb!nn
	
 
	
 
	
r_   )r[   r   )r  rL  s     r`   position_meshgridr  	  s;     	
	
 
	
 '
	
 
	
 
	
 I r_   c                        e Zd Zdef fdZedefd            Zedej	        j
        fd            Zedej        fd            Zedej        fd            Zdeej                 dej        fd	Z xZS )
r'  r  c                 t   t                                                       || _        t          |j        |j        |j        |j        d          | _        t          |j        d          | _	        t          |          | _        | j        j        | j        j        z  }|dz  dk    s
J d            d | _        d S )NFin_channelsout_channelskernel_sizestrider  r  r  r>  r   zROPE requires even head_dim)rg   rh   r  r   r  r*  r   
patch_convr    ln_prer  transformerr  
_freqs_cis)ri   r  r  rj   s      r`   rh   zVisionTransformer.__init__  s    	%))?
 
 
 d.D999&t,,9(DI,II!|q   "?   /3r_   rd   c                 4    | j         j        | j         j        z  S rf   )r  r   r   rt   s    r`   max_patches_per_sidez&VisionTransformer.max_patches_per_side.  s    y#ty';;;r_   c                 N    t          |                                           j        S rf   )next
parametersr  rt   s    r`   r  zVisionTransformer.device2  s    DOO%%&&--r_   c                 N    t          |                                           j        S rf   )r  r  r  rt   s    r`   r  zVisionTransformer.dtype6  s    DOO%%&&,,r_   c                    | j         Ct          | j        j        | j        j        z  | j        | j        | j        j                  | _         | j         j        | j        k    r%| j                             | j                  | _         | j         S )N)r  r   r   r  r  )	r  r  r  r*  r  r	  r  r  tort   s    r`   r  zVisionTransformer.freqs_cis:  s~    ?"5I)TY-JJ0/i*	  DO ?!T[00"o000DDDOr_   rU   c                 h     fd|D             }d |D             }d |D             }t          j        |d          }                     |          }t          |                               j                  } j        |dddf         |dddf         f         }t          r4t          j	        j
        j                            d |D                       }ndd	lm}  |d
 |D             |          }                     |||          }	t          j        |	                    d          |          S )a  
        Args:
            images: list of N_img images of variable sizes,
                each of shape (C, H, W)
        Returns:
            image_features: tensor of token features for
                all tokens of all images of shape (N_toks, D)
        c                     g | ]B}                     |                    d                               j                            CS r9  r  	unsqueezer  r  r   rw   ri   s     r`   r=  z-VisionTransformer.forward.<locals>.<listcomp>V  N     
 
 
ADDOOCMM!,,//
;;<<
 
 
r_   c                 b    g | ],}|                     d                               dd d          -S r>  r   r=   r  permuter  s     r`   r=  z-VisionTransformer.forward.<locals>.<listcomp>Z  4    QQQ!		!,,Q155QQQr_   c                 (    g | ]}|j         d          S r  r:  r  s     r`   r=  z-VisionTransformer.forward.<locals>.<listcomp>[      888aqwqz888r_   r=   r  Nr   c                 D    g | ]}|j         d          |j         d         z  S r  r   r:  r  s     r`   r=  z-VisionTransformer.forward.<locals>.<listcomp>h  )    FFFqqwr{*FFFr_   generate_block_attention_maskc                 D    g | ]}|j         d          |j         d         z  S r  r:  r  s     r`   r=  z-VisionTransformer.forward.<locals>.<listcomp>p  r  r_   r  )r[   r   r  r  r  r  r  r  r  fmhar  BlockDiagonalMaskfrom_seqlens,transformers.models.pixtral.modeling_pixtralr!  r  rD  squeeze)
ri   rU   r  patch_embedsembed_sizesrL  r  r  r!  r  s
   `         r`   rR  zVisionTransformer.forwardI  s   
 
 
 
HN
 
 
 RQ?PQQQ88<888 y1555{{<00 &&788;;DKHH	N9QQQT?IaaadO#CD	  	9&8EEFF4EFFF DD      10FF4EFFF D |$)LL {3;;q>>;777r_   )rV   rW   rX   r  rh   r   r   r	  r[   typesDevicer  r  r\   r  r]   rR  r   r   s   @r`   r'  r'    s       4. 4 4 4 4 4 4" <c < < < X< .* . . . X. -u{ - - - X- 5<    X,8U\",8 
,8 ,8 ,8 ,8 ,8 ,8 ,8 ,8r_   r'  c                   L     e Zd Zdedef fdZdej        dej        fdZ xZ	S )r0  r  r  c                 4   t                                                       t          |t                    sJ t	          j        |j        ||j                  | _        t	          j	                    | _
        t	          j        |||j                  | _        d S )Nr  )rg   rh   rn   r  r  r  r*  r  w_inGELUgeluw_out)ri   r  r  rj   s      r`   rh   zVisionLanguageAdapter.__init__y  s    $ 122222I"
 
 
	
 GII	YsCd.?@@@


r_   r  rd   c                 x    |                      |                     |                     |                              S rf   )r1  r0  r.  r  s     r`   rR  zVisionLanguageAdapter.forward  s*    zz$))DIIaLL11222r_   )
rV   rW   rX   r  r   rh   r[   r\   rR  r   r   s   @r`   r0  r0  x  sz        	A. 	AS 	A 	A 	A 	A 	A 	A3 3%, 3 3 3 3 3 3 3 3r_   r0  c            	            e Zd ZdZ	 ddedededdf fdZd	ej        d
e	e
eef                  dej        fdZd	ej        d
e	e
eef                  dej        fdZ xZS )r.  z<
    Learned merging of spatial_merge_size ** 2 patches
    Fr  r  r  rd   Nc                     t                                                       ||dz  z  }|| _        || _        t	          j        |||          | _        d S )Nr>  r  )rg   rh   r  mlp_input_dimr  r  merging_layer)ri   r  r  r  r5  rj   s        r`   rh   zPatchMerger.__init__  sc     	*.@!.CD"4*Y
 
 
r_   r  rC  c                     t          d |D                       t          |          k    sJ |                     ||          }|                     |          }|S )Nc                     g | ]
\  }}||z  S r^   r^   r   rR   rS   s      r`   r=  z'PatchMerger.forward.<locals>.<listcomp>  s     222daAE222r_   )sumr   r  r6  )ri   r  rC  s      r`   rR  zPatchMerger.forward  sc     22k22233s1vv==== LLK(( q!! r_   c                     t          ||| j                  }g }|D ]J}|j        d         }|                    |                    d|                                                     Kt          j        |d          S )a  
        Args:
            x: (N, D) where N is flattened and concatenated patch tokens
                for all images
            image_sizes: list of tuple of (height, width) in tokens for
                each image
        Returns:
            image_features: reorders patch tokens so each grid of
                (spatial_merge_size, spatial_merge_size) is contiguous.
                now (N / spatial_merge_size ** 2, D * spatial_merge_size ** 2)
        )r  rC  r  r   r   r  )get_sub_gridsr  r;  r   r  r   r[   r   )ri   r  rC  	sub_gridspermuted_tensorgrid	n_patchess          r`   r  zPatchMerger.permute  s    " "[T=T
 
 
	 /1 	 	D
2I""		"i((**,,    y
 
 
 	
r_   )F)rV   rW   rX   rY   r   r   rh   r[   r\   r]   r   rR  r  r   r   s   @r`   r.  r.    s          #	
 

  
 	

 

 
 
 
 
 
&,0sCx,A	    
<
 %S/*
 
	
 
 
 
 
 
 
 
r_   r.  rC  r  c                    d |D             }| j         d         }g }|}t          |                     |                    D ]\  }}||         \  }	}
|                    |	|
|                              ddd          d d d d d d d f         }t
          j        j                            |||          }|                    d|||d          }|	                    |d                    |S )Nc                     g | ]
\  }}||z  S r^   r^   r9  s      r`   r=  z!get_sub_grids.<locals>.<listcomp>  s     666$!QA666r_   r   r>  r   r=   )r  r  )
r;  r  rD  r  r  r[   r  r  unfoldr   )r  rC  r  tokens_per_imager  all_img_sub_gridssub_grid_sizeimage_indexr   rR   rS   
image_gridr=  s                r`   r<  r<    s    76+666	A,.&M%.qww7G/H/H%I%I / /!\;'1!&&q!Q//771a@@!!!QQQM

 H'..M- / 
 
	 NNq-
 
	 	  1....r_   c                   j    e Zd ZdededefdZdefdZdefdZdefdZdededeeef         fdZ	d	S )
PixtralHFEncoderInfor   r   rd   c                @    |                      ||          \  }}||z  S )N)r   r   )get_patch_grid_size)ri   r   r   r   r   s        r`   r   z)PixtralHFEncoderInfo.get_num_image_tokens  s4     //#% 0 
 
u u}r_   c                     | j         j        S rf   )r   r   rt   s    r`   r   z#PixtralHFEncoderInfo.get_image_size  s    !,,r_   c                 L    t          | j        dd          }| j        j        |z  S )Nr  r=   )r{  r  r   r   )ri   r  s     r`   get_patch_sizez#PixtralHFEncoderInfo.get_patch_size  s)    $T^5I1MM!,/AAAr_   c                 \    |                                  |                                 }}||z  S rf   )r   rO  )ri   r   r   s      r`   get_patch_grid_lengthz*PixtralHFEncoderInfo.get_patch_grid_length	  s0    !%!4!4!6!68K8K8M8MJ
 Z''r_   c                X   |                                  x}}|                                 x}}t          ||z  ||z            }|dk    rHt          t	          j        ||z                      }t          t	          j        ||z                      }t          ||f||f          \  }}	|	|fS )Nr=   )r   rO  maxr   mathfloor _get_pixtral_hf_num_image_tokens)
ri   r   r   	max_width
max_heightpatch_widthpatch_heightratior   r   s
             r`   rL  z(PixtralHFEncoderInfo.get_patch_grid_size  s     "&!4!4!6!66	J%)%8%8%:%::lK)+\J-FGG199dju)<==>>Ktz,*>??@@L7;';'
 
u
 e|r_   N)
rV   rW   rX   r   r   r   rO  rQ  r   rL  r^   r_   r`   rJ  rJ    s        
 
 	

 

 
 
 
- - - - -B B B B B
(s ( ( ( (  	
 
sCx     r_   rJ  c            
       d     e Zd Z	 ddddededz  deddf fdZd	ej        dej        fd
Z	 xZ
S )PixtralHFMLPNr   r  r  quant_configr  rd   c                L   t                                                       t                      }|j        J t	          |j        |j        gdz  d|| d|          | _        t          |j        |j        d|| d|          | _        t          |j
                  | _        d S )Nr>  F.gate_up_proj)
input_sizeoutput_sizesr  r^  r  
disable_tpz
.down_projra  output_sizer  r^  r  rc  )rg   rh   rG   r  r!   r*  gate_up_projr#   	down_projr   
hidden_actact_and_mulri   r  r^  r  use_data_parallelrj   s        r`   rh   zPixtralHFMLP.__init__)  s     	466'3336) 23a7%+++(
 
 
 +/*%((((
 
 
 .f.?@@r_   r  c                     |                      |          \  }}|                     |          }|                     |          \  }}|S rf   )rf  ri  rg  )ri   r  gate_upr   s       r`   rR  zPixtralHFMLP.forwardG  sF    &&q))
W%%~~a  1r_   rf   rV   rW   rX   r   r$   r   rh   r[   r\   rR  r   r   s   @r`   r]  r]  (  s         37A
 A A A#A )4/A
 A 
A A A A A A< %,        r_   r]  c                        e Zd Z	 ddddededz  deddf fdZd	ej        d
ej        dej        de	ej        ej        dz  f         fdZ
 xZS )PixtralHFAttentionNr   r  r  r^  r  rd   c          	         t                                                       || _        |j        |j        z  rJ |j        | _        |j        |j        z  | _        | j        | j        z  |j        k    sJ t                      }t          |j        | j        | j        d|| d|          | _	        t          |j        |j        d|| d|          | _        |rdnt                      | _        t          |j        | j                  | _        d S )NF	.qkv_proj)r*  	head_sizetotal_num_headsr  r^  r  rc  z.o_projrd  r=   )rg   rh   r  r*  r  rt  r  rG   r"   qkv_projr#   o_projr   tp_sizer   r  rj  s        r`   rh   zPixtralHFAttention.__init__O  s-    	%(BBBBB%9*f.HH#dm3v7IIIII466)*m 0%'''(
 
 
 ()*%%%%(
 
 
 #NAA(L(N(N 	 f8$,GGr_   rQ  attention_maskposition_embeddingsc                    |                                 \  }}}|                     |          \  }}|                    dd          \  }}	}
|                    ||| j        | j                                      dd          }|	                    ||| j        | j                                      dd          }	|
                    ||| j        | j                  }
|\  }}t          ||	||d          \  }}	t          ri|                    dd          	                                }|	                    dd          	                                }	t          j        ||	|
|          }nO|
                    dd          }
t          j                            ||	|
|	          }|                    dd          }|                    ||| j        | j        z            }|                     |          \  }}|d fS )
NrQ   r   r  r=   r>  r   )unsqueeze_dimr  r  )sizeru  chunkr  r  r  r  r   r  
contiguousr  r  r  r  r  r  rv  )ri   rQ  rx  ry  r  r  r   
qkv_statesr  r  r  cossinr  attn_outputs                  r`   rR  zPixtralHFAttention.forwardv  s    *..00wm44
A""1""--1a FF5'4<??II!QOOFF5'4<??II!QOOFF5'4<??&S#Aq#s!DDD1 
	&Aq!!,,..AAq!!,,..A1!Q^TTTCCAq!!A-<<1a> =  C --1%%Ckk%$,*FGGS))QD  r_   rf   )rV   rW   rX   r   r$   r   rh   r[   r\   r   rR  r   r   s   @r`   rp  rp  N  s         37%H
 %H %H %H#%H )4/%H
 %H 
%H %H %H %H %H %HN!!|!! !! #\	!!
 
u|U\D00	1!! !! !! !! !! !! !! !!r_   rp  c            
            e Zd Z	 ddddededz  deddf fdZd	ej        d
ej        dej        dej        fdZ	 xZ
S )PixtralHFTransformerBlockNr   r  r  r^  r  rd   c                   t                                                       t          |j        d          | _        t          ||| d          | _        t          ||| d          | _        t          |j        d          | _	        d S )Nr  r  z
.attention)r^  r  z.feed_forward)
rg   rh   r    r*  r  rp  r  r]  r  r  )ri   r  r^  r  rj   s       r`   rh   z"PixtralHFTransformerBlock.__init__  s     	%f&8dCCC+%(((
 
 

 )%+++
 
 

   2===r_   rQ  rx  ry  c                     | j                             |                     |          ||          \  }}||z   }| j                            |                     |                    }||z   }|S )N)rx  ry  r  )ri   rQ  rx  ry  r  r   rR   r  s           r`   rR  z!PixtralHFTransformerBlock.forward  sw     ~%%..) 3 & 
 
1
 A%%dmmA&6&677!e
r_   rf   rn  r   s   @r`   r  r    s         37>
 > > >#> )4/>
 > 
> > > > > >,|  #\	
 
       r_   r  c                        e Zd Z	 dddddededz  dedz  deddf
 fd	Zd
ej	        dej	        dej	        de
dej	        f
dZ xZS )PixtralHFTransformerNr   )num_hidden_layers_overrider  r  r^  r  r  rd   c                    t                                                       |j        }n|}t          j        fdt          |          D                       | _        d S )Nc           	      >    g | ]}t           d |           S )z.layers.)r  r^  r  )r  )r   	layer_idxr  r  r^  s     r`   r=  z1PixtralHFTransformer.__init__.<locals>.<listcomp>  sQ         *!!-$99i99    r_   )rg   rh   r  r  r  r  r  )ri   r  r^  r  r  r  rj   s    `` ` r`   rh   zPixtralHFTransformer.__init__  s     	%- & 8 :m      "''8!9!9  	
 	
r_   r  rx  ry  return_all_hidden_statesc                 p    |g}| j         D ]&} ||||          }|r|                    |           '|r|S |S rf   )r  r   )ri   r  rx  ry  r  hidden_states_poolr  s          r`   rR  zPixtralHFTransformer.forward  sc      S[ 	- 	-Ea)<==A' -"))!,,, $ 	&%%r_   rf   )rV   rW   rX   r   r$   r   r   rh   r[   r\   r   rR  r   r   s   @r`   r  r    s         37

 26
 
 
#
 )4/

 %($J
 
 

 
 
 
 
 
4<  #\	
 #' 
       r_   r  c                       e Zd Z	 ddddddededz  dedz  dedz  ded	df fd
Zdddde	e
j                 de	e         dz  dedz  d	ee
j        df         fdZdeeee
j        f                  d	ee         fdZ xZS )PixtralHFVisionModelNr   )r  require_post_normr  r  r^  r  r  r  rd   c                   t                                                       || _        t          |j        |j        |j        |j        d          | _        t          |j        d          | _	        t          |||| d          | _        |j        }t          | j        j                  |j        k    r-t          d| dt          | j        j                   d	          |d
u rd}t          |          t!          |                                           j        | _        t!          |                                           j        | _        t)          || j                  | _        d S )NFr  r  r  z.transformer)r^  r  r  zThe original encoder only has z layers, but you requested z layers.Tz1PixtralHFVisionModel does not have post-layernorm)rg   rh   r  r   r  r*  r   r  r    r  r  r  r  r   r  r   r  r  r  r  r   patch_positional_embedding)	ri   r  r^  r  r  r  r  msgrj   s	           r`   rh   zPixtralHFVisionModel.__init__  ss    	%++)$
 
 
 f0d;;;/%'A***	
 
 
 #4t&''&*BBB1B  -01A1H-I-I     $$ECS//!$//++,,2
4??,,--4*@*U*U'''r_   )select_layersfeature_select_strategyrN   r  r  .c                     fd|D             }d |D             }d |D             }t          j        |d          }                     |          }t          | j        j         j        j        z                                 j                  } 	                    ||          }t          r4t          j        j        j                            d |D                       }	ndd	lm}
  |
d
 |D             |          }	                     ||	||du          }t'          |d| j        j        |          }t          j        |                    d          |          S )a~  
        Args:
            pixel_values: Each image to be processed will be a separate tensor
                in pixel_values. This means it will be a list of tensors
                because multiple requests batched can have multiple images,
                each with their own shape potentially
            select_layers: Layer indices whose features should be
                concatenated and used as the visual encoder output. If none
                are provided, the last layer is used.

        Returns:
            image_features: tensor of token features for
                all tokens of all images of shape (N_toks, D)
        c                     g | ]B}                     |                    d                               j                            CS r9  r  r  s     r`   r=  z0PixtralHFVisionModel.forward.<locals>.<listcomp>4  r  r_   c                 b    g | ],}|                     d                               dd d          -S r  r  r  s     r`   r=  z0PixtralHFVisionModel.forward.<locals>.<listcomp>8  r  r_   c                 (    g | ]}|j         d          S r  r:  r  s     r`   r=  z0PixtralHFVisionModel.forward.<locals>.<listcomp>9  r  r_   r=   r  )rW  c                 D    g | ]}|j         d          |j         d         z  S r  r:  r  s     r`   r=  z0PixtralHFVisionModel.forward.<locals>.<listcomp>H  r  r_   r   r   c                 D    g | ]}|j         d          |j         d         z  S r  r:  r  s     r`   r=  z0PixtralHFVisionModel.forward.<locals>.<listcomp>P  r  r_   N)r  )r  max_possible_layersr  )r[   r   r  r   r  r   r   r  r  r  r  r  r#  r  r$  r%  r&  r!  r  rH   r  rD  r'  )ri   rN   r  r  r  r(  r)  position_idsposition_embeddingrx  r!  r  s   `           r`   rR  zPixtralHFVisionModel.forward  s   ,
 
 
 
HT
 
 
 RQ?PQQQ88<888 y1555{{<00 0k,0FF
 
 
 "T[// 	 "<<\<XX 	!Y0BOOFF4EFFF NN      ;:FF4EFFF N %2$%>	  
 
 -' $ =$;
 
 
 {3;;q>>;777r_   rU  c                 6   g d}t          |                                           }t                      }t          | j        j                  }|D ]\  }}|                    d          r/t          |                    d          d                   }||k    rI|D ]>\  }	}
}|
|vr|	                    |
|	          }||         }|j
        } ||||            n*||         }t          |dt                    } |||           |                    |           |S )N))rr  z.q_projr  )rr  z.k_projr  )rr  z.v_projr  )r`  z
.gate_projr   )r`  z.up_projr=   ztransformer.layersre  r>  weight_loader)r   rp  setr   r  r  r  r   rD  replacer  r{  r%   add)ri   rU  stacked_params_mappingparams_dictloaded_paramslayer_countr  loaded_weightr  
param_nameweight_nameshard_idrj  r  s                 r`   rq  z!PixtralHFVisionModel.load_weightsg  sK   "
 "
 "
 4002233"%%%$*122#* 	$ 	$D-344 

3 233	++5K 4 41
Kd**||K<<#D) % 3e]H===#D) '@U V Ve]333d####r_   rf   )rV   rW   rX   r   r$   r   r   r   rh   r]   r[   r\   rF   r   rR  r   r  rq  r   r   s   @r`   r  r    sb        37*V
 26)-*V *V *V#*V )4/*V
 %($J*V  $;*V *V 
*V *V *V *V *V *V` +/FJE8 E8 E85<(E8 Cy4'	E8
 "=t!CE8 
u|S 	!E8 E8 E8 E8R!HU33D-E$F !3s8 ! ! ! ! ! ! ! !r_   r  )rT  collections.abcr   r   r   dataclassesr   r   	functoolsr   typingr	   r
   r[   torch.nnr  torch.nn.functionalr  r  &mistral_common.protocol.instruct.chunkr   r   )mistral_common.protocol.instruct.messagesr   (mistral_common.protocol.instruct.requestr   +mistral_common.tokens.tokenizers.multimodalr   PILr   transformersr   r   r   transformers.image_utilsr   4transformers.models.pixtral.image_processing_pixtralr   rV  r&  r   r   r   $transformers.tokenization_utils_baser   vllm.configr   vllm.config.multimodalr   vllm.distributedr   r   %vllm.model_executor.layers.activationr   vllm.model_executor.layers.convr   $vllm.model_executor.layers.layernormr    !vllm.model_executor.layers.linearr!   r"   r#   'vllm.model_executor.layers.quantizationr$   -vllm.model_executor.model_loader.weight_utilsr%   vllm.multimodalr&   r'   vllm.multimodal.inputsr(   r)   r*   r+   vllm.multimodal.parser,   r-   r.   vllm.multimodal.processingr/   r0   $vllm.multimodal.processing.processorr1   r2   r3   r4   r5   r6   vllm.platformsr7   vllm.sequencer8   vllm.tokenizersr9   vllm.tokenizers.mistralr:   vllm.utils.tensor_schemar;   r<   
interfacesr>   r?   r@   rA   module_mappingrB   utilsrC   rD   visionrE   rF   rG   rH   xformersrI   r  is_cudahas_device_capabilityr  ImportErrorr-  rM   rb   r   r   r   register_processorModuler   r  r\   r  r   r  r  r   r  r  r  r  r  r]   r  r'  r0  r.  r<  rJ  r]  rp  r  r  r  r^   r_   r`   <module>r     s
    7 7 7 7 7 7 7 7 7 7 ) ) ) ) ) ) ) ) % % % % % % % % % % % % % %                 H H H H H H H H A A A A A A J J J J J J D D D D D D       F F F F F F F F F F / / / / / /              
 ; : : : : : " " " " " " 3 3 3 3 3 3 I I I I I I I I D D D D D D 7 7 7 7 7 7 8 8 8 8 8 8         
 G F F F F F O O O O O O F F F F F F F F            V U U U U U U U U U N N N N N N N N                , + + + + + - - - - - - 8 8 8 8 8 8 4 4 4 4 4 4 > > > > > > > >            + * * * * * ; ; ; ; ; ; ; ;           
$$$$$$!!  &L&6&LS&Q&Q           l   &T
 T
 T
 T
 T
 T
 T
 T
n.F .F .F .F .F. .F .F .Fb7
 7
 7
 7
 7
 67L M 7
 7
 7
t:) :) :) :) :)!89N!O :) :) :)z ('	*  
f4 f4 f4 f4 f4I|/f4 f4 
f4T         "el "u| " " " " "<	<< < 	<
 \< < < <8222 |2 5<%&	2 2 2 2	8 	8 	8 	8 	8") 	8 	8 	8& & & & &	 & & &R    ry   .    ")   $EL)
\   &Y8 Y8 Y8 Y8 Y8	 Y8 Y8 Y8x3 3 3 3 3BI 3 3 3 D
 D
 D
 D
 D
") D
 D
 D
N|eCHo&  
%,	   J1 1 1 1 1,-@A 1 1 1h# # # # #29 # # #LI! I! I! I! I! I! I! I!X% % % % %	 % % %P, , , , ,29 , , ,^W W W W W29 W W W W Ws   7*E" "E,+E,