
    .`i?                     h   d dl mZmZmZ d dlmZmZ d dlZd dlmZ d dl	m
Z
mZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZ d dl m!Z!m"Z"m#Z# d dl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* d dl+m,Z, d dl-m.Z.m/Z/ ddl0m1Z1m2Z2m3Z3 ddl4m5Z5 ddl6m7Z7m8Z8m9Z9m:Z:m;Z;  G d de.          Z< G d dej=                  Z> G d de'          Z? G d de%e?                   Z@ G d d e&e?                   ZAd!ed"eBfd#ZC ejD        eAe?e@$           G d% d&ej=        e2e3                      ZEdS )'    )IterableMappingSequence)	AnnotatedLiteralN)nn)BatchFeatureGotOcr2ImageProcessor)ACT2FN)get_size_dict)AyaVisionConfig)AyaVisionProcessor)get_optimal_tiled_canvas)
VllmConfig)BaseDummyOptions)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)ImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsMultiModal
SupportsPP)SiglipVisionModel)AutoWeightsLoaderWeightsMapperget_layer_indexinit_vllm_registered_modelmaybe_prefixc                       e Zd ZU dZed         ed<   eej         e	dddd          f         ed<   eej         e	d          f         ed	<   d
S )AyaVisionImagePixelInputsa  
    Dimensions:
        - np: The total number of patches over each image over each prompt in
              the batch
        - c: Number of channels
        - h: Height of each image patch
        - w: Width of each image patch
        - bn: Batch size * number of images
    pixel_valuestypenp   hwbnnum_patchesN)
__name__
__module____qualname____doc__r   __annotations__r   torchTensorr!        y/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/aya_vision.pyr-   r-   1   s           .
!!!!D!S#&&	(   
 D	     r>   r-   c                   l     e Zd Zdef fdZdej        dej        fdZdej        dej        fdZ xZ	S )AyaVisionMultiModalProjectorconfigc                    t                                                       || _        |j        | _        t	          |d|j        j                  | _        t          j	        |j
        j        |j        dz  z  |j                  | _        t          j        |j
        j        |j        dz  z  | j        d          | _        t          d         | _        t          j        | j        dz  |j        j        d          | _        d S )Nalignment_intermediate_size   )epsT)biassilu)super__init__rB   downsample_factorgetattrtext_confighidden_sizerD   r   	LayerNormvision_configadapter_layer_norm_eps	layernormLinearlinear_1r   actlinear_2)selfrB   	__class__s     r?   rJ   z%AyaVisionMultiModalProjector.__init__J   s    !'!9+2163E3Q,
 ,
(  ,0H!0KL-
 
 

 	 ,0H!0KL,
 
 
 &>	,1*
 
 
r>   image_featuresreturnc                    |                      |          }|                     |          }|                     |          }|                    dd          \  }}|                     |          |z  }|                     |          }|S )NrE   )dim)pixel_shufflerR   rT   chunkrU   rV   )rW   rY   hidden_statesxgates        r?   forwardz$AyaVisionMultiModalProjector.forwardd   s    ++N;;77n55  %%aR%004*m44r>   c           	         |j         \  }}}t          |dz            x}}|                    |j         d         ||d          }|j         d         }|                    ||t          || j        z            t          || j        z                      }|                    dddd          }|                    |t          || j        z            t          || j        z            d          }|                    dddd          }|S )Ng      ?r   r\   rE   r"   r1   )shapeintreshaperK   permute)rW   rY   
batch_size
seq_length_heightwidthchannelss           r?   r^   z*AyaVisionMultiModalProjector.pixel_shufflep   s   $2$8!
JZ_---'// #UFB
 
 "'+'////0041122	
 
 (//1a;;'////00..//	
 
 (//1a;;r>   )
r6   r7   r8   r   rJ   r;   r<   rc   r^   __classcell__rX   s   @r?   rA   rA   I   s        
 
 
 
 
 
 
4
el 
u| 
 
 
 
EL U\        r>   rA   c                       e Zd ZdefdZdedefdZdedefdZ	de
eedz  f         fdZdefdZd	ed
ededededefdZdS )AyaVisionProcessingInforZ   c                 @    | j                             t                    S N)ctxget_hf_configr   rW   s    r?   rv   z%AyaVisionProcessingInfo.get_hf_config   s    x%%o666r>   kwargsc                 2     | j         j        t          fi |S rt   )ru   get_hf_processorr   rW   rx   s     r?   rz   z(AyaVisionProcessingInfo.get_hf_processor   s     (tx();FFvFFFr>   c                 &     | j         di |j        S Nr=   )rz   image_processorr{   s     r?   get_image_processorz+AyaVisionProcessingInfo.get_image_processor   s    $t$..v..>>r>   Nc                 
    dd iS )Nimager=   rw   s    r?   get_supported_mm_limitsz/AyaVisionProcessingInfo.get_supported_mm_limits   s    r>   c                     |                                  }|j        d         }|j        d         }|j        }t          ||z  ||z            S )Nrl   rm   )rl   rm   )r   sizemax_patchesr   )rW   r~   rl   rm   r   s        r?   !get_image_size_with_most_featuresz9AyaVisionProcessingInfo.get_image_size_with_most_features   sR    2244 %h/$W-%1 4EK<OPPPPr>   image_widthimage_heightr   min_patchesr   c                    t          |d          }t          ||f|d         |d         f||          \  }}||z  }|dk    r|n|dz   S )z
        Calculate the number of patches needed for a given image based on size
        constraints.  This method replicates and adjusts the logic from:
        transformers/models/got_ocr2/image_processing_got_ocr2
        F)default_to_squarerl   rm   r"   )r   r   )	rW   r   r   r   r   r   num_columnsnum_rows
num_blockss	            r?   get_num_patchesz'AyaVisionProcessingInfo.get_num_patches   sj     TU;;; 8;'(^T']+	!
 !
X !8+
'1__zz*q.@r>   )r6   r7   r8   r   rv   objectr   rz   r
   r   r   strrf   r   r   r   dictr   r=   r>   r?   rr   rr      s       7 7 7 7 7G G4F G G G G?F ?7L ? ? ? ?cDj)A    Q9 Q Q Q QA A 	A
 A A A 
A A A A A Ar>   rr   c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	AyaVisionDummyInputsBuilder	mm_countsrZ   c                 x    |                     dd          }| j                                        }|j        }||z  S )Nr   r   )getinforz   image_token)rW   r   
num_images	processorr   s        r?   get_dummy_textz*AyaVisionDummyInputsBuilder.get_dummy_text   s;    ]]7A..
I..00	+Z''r>   Nseq_len
mm_optionsc                     |                     dd          }| j                                        }|r|                     d          nd }d|                     |j        |j        ||          iS )Nr   r   )rm   rl   r   	overrides)r   r   r   _get_dummy_imagesrm   rl   )rW   r   r   r   r   
image_sizeimage_overridess          r?   get_dummy_mm_dataz-AyaVisionDummyInputsBuilder.get_dummy_mm_data   s}     ]]7A..
Y@@BB
5?I*..111T T++ &!(%)	 ,  
 	
r>   rt   )
r6   r7   r8   r   r   rf   r   r   r   r   r=   r>   r?   r   r      s        (S(9 (c ( ( ( ( =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
r>   r   c            
            e Zd Zdedeeef         deeef         deeef         def
 fdZdedeeef         deeef         fd	Z	d
e
deeef         dedee         fdZ xZS )AyaVisionMultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsrZ   c                    
 t                                          ||||          }  j        j        di |}|j        
|                    d          x}                                                     d|i                              dt                    fdt          t                              D             }
 fd|D             }	t          j        |	          |d<   |S )Nimagesr   c                 :    g | ]}                     |          S r=   )get_image_size).0iparsed_imagess     r?   
<listcomp>zCAyaVisionMultiModalProcessor._call_hf_processor.<locals>.<listcomp>   s4       45,,Q//  r>   c           	          g | ]:}j                             |j        |j        j        j        j                   ;S )r   r   r   r   r   )r   r   rm   rl   r   r   r   )r   r   r~   rW   s     r?   r   zCAyaVisionMultiModalProcessor._call_hf_processor.<locals>.<listcomp>   s`     	 	 	  	)) * 0!+!2(- / ; / ; *  	 	 	r>   r5   r=   )rI   _call_hf_processorr   rz   r~   r   _get_data_parserparse_mm_data	get_itemsr   rangelenr;   tensor)rW   r   r   r   r   processed_outputshf_processorr   image_sizesr5   r~   r   rX   s   `         @@r?   r   z/AyaVisionMultiModalProcessor._call_hf_processor   s%    "GG66	
 
 2ty1>>I>>&6 kk(+++F8%%''0117$788 
   9>s=?Q?Q9R9R  K	 	 	 	 	 #.	 	 	K 05|K/H/Hm,  r>   	hf_inputshf_processor_mm_kwargsc                     |                     dt          j        d                    }t          t	          j        d|          t	          j        d          t	          j        d                    S )Nr5   r   r   )r.   r5   image_embeds)r   r;   emptyr   r   flat_from_sizesbatched)rW   r   r   r5   s       r?   _get_mm_fields_configz2AyaVisionMultiModalProcessor._get_mm_fields_config   sb    
  mmM5;q>>BB.>wTT-5g>>.6w??
 
 
 	
r>   mm_itemsout_mm_kwargsc                        j         j        di |j        }j        j        dt
          f fd}t          d||          gS )Nitem_idxc                 *                        dt                    }|                    |           }	j                            |j        |j        j        j        j	                  }
                    |          }t          j        |          S )Nr   r   )r5   )r   r   r   r   r   rm   rl   r   r   r   _prompt_split_imager   select_text)
r   r   r   r5   replr   r~   img_patch_tokenr   rW   s
        r?   get_replacementzIAyaVisionMultiModalProcessor._get_prompt_updates.<locals>.get_replacement  s    ''1DEEF$*$9$9($C$CJ)33&,'.$)+7+7 4  K  333LLD&24IIIr>   r   )modalitytargetreplacementr=   )r   rz   r   r   r~   rf   r   )	rW   r   r   r   r   r   r   r~   r   s	   ``    @@@r?   _get_prompt_updatesz0AyaVisionMultiModalProcessor._get_prompt_updates	  s     2ty1KK4JKK".&6&6	Jc 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J  "+  
 	
r>   )r6   r7   r8   r   r   r   r	   r   r   r   r   r   r   r   r   ro   rp   s   @r?   r   r      s       '!'! f%'! 3;'	'!
 CK('! 
'! '! '! '! '! '!R



 !(V 4

 
++	,	

 

 

 


%
 !(V 4
 -	

 
,	
 
 
 
 
 
 
 
r>   r   	hf_configrZ   c                 "   | j         }| j        j        t          |t                    rt          |          S t          |t          t          f          rt          fd|D                       S t          dt          |           d          )Nc              3   8   K   | ]}t          |          V  d S rt   )r)   )r   idxnum_hidden_layerss     r?   	<genexpr>z)_get_num_hidden_layers.<locals>.<genexpr>3  s.      UUs?3(9::UUUUUUr>   zvision_layer_feature type: z is not supported)vision_feature_layerrP   r   
isinstancerf   r)   listtuplemax	TypeErrorr/   )r   feature_layersr   s     @r?   _get_num_hidden_layersr   +  s    3N!/A.#&& V~/@AAA	NT5M	2	2 VUUUUnUUUUUU
Md>&:&:MMM  r>   )r   dummy_inputsc                       e Zd Z eddddd          Zededed	ed
z  fd            Zddde	def fdZ
ed             Zdeeeej        f                  d	ee         fdZdedej        d	ej        eej        df         z  fdZded	eej                 fdZded	ed
z  fdZded	efdZ	 	 d$dej        dej        ded
z  d ej        d
z  ded	ej        ez  fd!Zd"ej        d	ej        d
z  fd#Z xZS )%!AyaVisionForConditionalGenerationzlanguage_model.model.zvision_tower.zmulti_modal_projector.zlanguage_model.lm_head.)zmodel.language_model.zmodel.vision_tower.zmodel.multi_modal_projector.zlm_head.)orig_to_new_prefixr   r   rZ   Nc                 N    |                     d          rdS t          d          )Nr   z<image>z Only image modality is supported)
startswith
ValueError)clsr   r   s      r?   get_placeholder_strz5AyaVisionForConditionalGeneration.get_placeholder_strI  s,    w'' 	9;<<<r>    )prefixvllm_configr   c          
      \   t                                                       |j        j        }|j        }|j        j        }t          |          }|| _        || _        || _        |                     |d          5  t          |j
        ||t          |d                    | _        t          |          | _        d d d            n# 1 swxY w Y   |                     |          5  t!          ||j        t          |d          dg          | _        d d d            d S # 1 swxY w Y   d S )Nr   vision_model)num_hidden_layers_overrider   modelCohere2ForCausalLM)r   r   r   architectures)rI   rJ   model_configr   quant_configmultimodal_configr   rB   _mark_tower_modelr&   rP   r+   vision_towerrA   multi_modal_projector_mark_language_modelr*   rM   language_model)rW   r   r   rB   r   r   r   rX   s          r?   rJ   z*AyaVisionForConditionalGeneration.__init__P  s   "-":"D"/'4F26::(!2##K99 	N 	N 1$+<#FN;;	! ! !D *Ff)M)MD&	N 	N 	N 	N 	N 	N 	N 	N 	N 	N 	N 	N 	N 	N 	N &&{33 	 	"<' ,#FG4434# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s%   ;A CCC'-D!!D%(D%c                 N    t          |                                           j        S rt   )next
parametersdtyperw   s    r?   r   z'AyaVisionForConditionalGeneration.dtypel  s    DOO%%&&,,r>   weightsc                 X    t          |           }|                    || j                  S )N)mapper)r'   load_weightshf_to_vllm_mapper)rW   r  loaders      r?   r  z.AyaVisionForConditionalGeneration.load_weightsp  s+    "4((""743I"JJJr>   r   r.   .c                 b     ||                     |j                  | j        j                  S )N)r   )feature_select_strategy)tor   rB   vision_feature_select_strategy)rW   r   r.   s      r?   _image_pixels_to_featuresz;AyaVisionForConditionalGeneration._image_pixels_to_featurest  s9    
 |OO,"4O55$(K$N
 
 
 	
r>   image_inputc                     |d         }|d         }|                      | j        |          }|                     |          }d |                    |                                          D             S )Nr.   r5   )r.   c                 :    g | ]}|                     d d          S )r   rE   )flatten)r   es     r?   r   zJAyaVisionForConditionalGeneration._process_image_input.<locals>.<listcomp>  s$    RRRA		!QRRRr>   )r  r   r   splittolist)rW   r  rx   r.   r5   rY   r   s          r?   _process_image_inputz6AyaVisionForConditionalGeneration._process_image_input~  s}     #>2!-077L 8 
 
 11.AARR););K<N<N<P<P)Q)QRRRRr>   rx   c                    |                     dd           }|                     dd           }|                     dd           }|
J d            |d S t          d||| j        j        j        | j        j        j        d          S )Nr.   r5   r   z)Aya Vision does not support image_embeds.)r2   r3   )r/   r.   r5   resolve_bindings)popr-   rB   rP   r   )rW   rx   r.   r5   r   s        r?   _parse_and_validate_image_inputzAAyaVisionForConditionalGeneration._parse_and_validate_image_input  s     zz.$77jj55zz.$77##%P###4(%#[.9[.9 	
 
 
 	
r>   c                 @     | j         di |}|g S  | j        |fi |S r}   )r  r  )rW   rx   r  s      r?   embed_multimodalz2AyaVisionForConditionalGeneration.embed_multimodal  s@    :d:DDVDDI(t(?????r>   	input_ids	positionsintermediate_tensorsinputs_embedsc                 J    |d }| j                             ||||          }|S )N)r  r  r  r  )r   r   )rW   r  r  r  r  rx   r`   s          r?   rc   z)AyaVisionForConditionalGeneration.forward  s@      + M+11!5'	 2 
 
 r>   r`   c                 6    | j                             |          S rt   )r   compute_logits)rW   r`   s     r?   r   z0AyaVisionForConditionalGeneration.compute_logits  s     "11-@@@r>   )NN) r6   r7   r8   r(   r  classmethodr   rf   r   r   rJ   propertyr   r   r   r;   r<   setr  r&   r  r-   r   r  r   r  r#   r  r   rc   r   ro   rp   s   @r?   r   r   9  s        & &=#2,D1
 
   =3 =3 =3: = = = [= BD   z 3      8 - - X-KHU33D-E$F K3s8 K K K K
'
 l
 
elC/0	0	
 
 
 
	S4	S	el		S 	S 	S 	S

	"T	)
 
 
 
*@ @4H @ @ @ @ <@-1 < < 2D8	
 |d*  
+	+   &A|A 
	A A A A A A A Ar>   r   )Fcollections.abcr   r   r   typingr   r   r;   r   transformersr	   r
   transformers.activationsr   #transformers.image_processing_utilsr   transformers.models.aya_visionr   4transformers.models.aya_vision.processing_aya_visionr   6transformers.models.got_ocr2.image_processing_got_ocr2r   vllm.configr   vllm.config.multimodalr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   r   vllm.multimodal.processingr   r   r   r   r   r   vllm.sequencer   vllm.utils.tensor_schemar    r!   
interfacesr#   r$   r%   siglipr&   utilsr'   r(   r)   r*   r+   r-   ModulerA   rr   r   r   rf   r   register_processorr   r=   r>   r?   <module>r9     s   8 7 7 7 7 7 7 7 7 7 % % % % % % % %        < < < < < < < < + + + + + + = = = = = = : : : : : : S S S S S S      # " " " " " 3 3 3 3 3 3 / / / / / /         
 V U U U U U U U U U                . - - - - - > > > > > > > > L L L L L L L L L L % % % % % %                    0< < < < <29 < < <~*A *A *A *A *A0 *A *A *AZ
 
 
 
 
"89P"Q 
 
 
<U
 U
 U
 U
 U
#:;R#S U
 U
 U
po #     (' 	 ,  
~A ~A ~A ~A ~A	3Ez ~A ~A 
~A ~A ~Ar>   