
    .`iEB                     t   d Z ddlmZmZmZ ddlmZmZ ddlZddlm	Z	 ddl
mZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZ ddlmZ ddlmZ ddl m!Z! ddl"m#Z#m$Z$m%Z% ddl&m'Z'm(Z(m)Z) ddl*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2 ddl3m4Z4m5Z5 ddl6m7Z7m8Z8m9Z9 ddl:m;Z; ddl<m=Z=m>Z>m?Z?m@Z@  G d de4          ZA G d de	jB                  ZC G d de-          ZD G d  d!e+eD                   ZE G d" d#e,eD                   ZF e!jG        eFeDeE$           G d% d&e	jB        e8e9                      ZHdS )'zJCommand-A-Vision (Cohere2Vision) multimodal model implementation for vLLM.    )IterableMappingSequence)	AnnotatedLiteralN)nn)BatchFeaturePretrainedConfig)Cohere2VisionConfig)get_optimal_tiled_canvas)Cohere2VisionProcessor)
VllmConfig)BaseDummyOptions)
MulAndSilu)MergedColumnParallelLinearRowParallelLinear)QuantizationConfig)	AWQConfig)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)ImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsMultiModal
SupportsPP)SiglipVisionModel)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefixc                       e Zd ZU dZed         ed<   eej         e	dddd          f         ed<   eej         e	d          f         ed	<   d
S )Cohere2VisionImagePixelInputsa  
    Dimensions:
        - np: The total number of patches over each image over each prompt in
              the batch
        - c: Number of channels
        - h: Height of each image patch
        - w: Width of each image patch
        - bn: Batch size * number of images
    pixel_valuestypenp   hwbnnum_patchesN)
__name__
__module____qualname____doc__r   __annotations__r   torchTensorr$        }/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/cohere2_vision.pyr/   r/   9   s           .
!!!!D!S#&&	(   
 D	     r@   r/   c                   X     e Zd ZdZd
dedef fdZd Zdej	        dej	        fd	Z
 xZS ) Cohere2VisionMultiModalProjectorzMultimodal projector that maps vision features to text embedding space.

    Uses pixel shuffle downsampling followed by SwiGLU activation.
     configprefixc                 r   t                                                       |j        | _        |j        j        |j        dz  z  }|j        dz  | _        t          || j        gdz  dd| d          | _        t                      | _
        t          | j        |j        j        dd| d          | _        d S )N   TFz	.linear_1)biasreturn_biasrF   z	.linear_2)super__init__downsample_factorvision_confighidden_sizealignment_intermediate_sizeintermediate_sizer   linear_1r   actr   text_configlinear_2)selfrE   rF   	input_dim	__class__s       rA   rL   z)Cohere2VisionMultiModalProjector.__init__W   s    !'!9 (48PRS8ST	 "(!Cq!H2#$q('''
 
 
 <<)"*'''
 
 
r@   c                     |                      |          }|                     |          }|                     |          }|                     |          }|S N)pixel_shufflerR   rS   rU   )rV   image_featureshidden_statess      rA   forwardz(Cohere2VisionMultiModalProjector.forwardt   sN    ++N;;n55//m44r@   r\   returnc                    t          |j        d         dz            x}}|                    |j        d         ||d          }|                                \  }}}}d| j        z  }	t          ||	z            }
t          ||	z            }|                    ||
| j        || j        |          }|                    dddddd	                                          }|                    ||
|d          }|S )
zApply pixel shuffle downsampling to reduce spatial dimensions.

        Args:
            image_features: Input tensor of shape [B, S, D] where S = H*W

        Returns:
            Downsampled tensor with increased channel dimension
        r%   g      ?r   g      ?r3   rH         )intshapereshapesizerM   permute
contiguous)rV   r\   heightwidthxnr4   r5   cscale_factornhnws               rA   r[   z.Cohere2VisionMultiModalProjector.pixel_shuffle{   s     ^1!4;<<<"">#7#:E62NNVVXX
1aT33\!""\!""IIaT3R9OQRSSIIaAq!Q''2244IIaR$$r@   )rD   )r8   r9   r:   r;   r   strrL   r^   r=   r>   r[   __classcell__rX   s   @rA   rC   rC   Q   s         

 
2 
C 
 
 
 
 
 
:  EL U\        r@   rC   c                       e Zd ZdefdZdedefdZdefdZde	e
edz  f         fdZdefdZd	ed
ededz  defdZdS )Cohere2VisionProcessingInfor_   c                 @    | j                             t                    S rZ   )ctxget_hf_configr   rV   s    rA   ry   z)Cohere2VisionProcessingInfo.get_hf_config   s    x%%&9:::r@   kwargsc                 2     | j         j        t          fi |S rZ   )rx   get_hf_processorr   rV   r{   s     rA   r}   z,Cohere2VisionProcessingInfo.get_hf_processor   s     (tx()?JJ6JJJr@   c                 &     | j         di |j        S Nr?   )r}   image_processorr~   s     rA   get_image_processorz/Cohere2VisionProcessingInfo.get_image_processor   s    $t$..v..>>r@   Nc                 
    dd iS )Nimager?   rz   s    rA   get_supported_mm_limitsz3Cohere2VisionProcessingInfo.get_supported_mm_limits   s    r@   c                     |                                  }|j        d         }|j        d         }|j        }t          ||z  |          S )Nrj   rk   )rj   rk   )r   rg   max_patchesr   )rV   r   rj   rk   r   s        rA   !get_image_size_with_most_featuresz=Cohere2VisionProcessingInfo.get_image_size_with_most_features   sM    2244 %h/$W-%1 4EBBBBr@   image_widthimage_height	processorc                    ||                                  }|j        }|j        }|j        }|j        }|j        }|sdS t          ||f|d         |d         f||          \  }	}
|	|
z  }|dk    r|dz  }|S )z
        Calculate the number of image patches for a given image.
        Uses the HF processor to determine the actual number of patches.
        Nr%   rj   rk   )r}   r   min_patchesr   rg   crop_to_patchesr   )rV   r   r   r   r   r   r   
patch_sizer   num_columnsnum_rowsr7   s               rA   get_num_patchesz+Cohere2VisionProcessingInfo.get_num_patches   s     --//I#3 &1%1$)
)9 	1 8;'!:g#67	!
 !
X "H,??1Kr@   )r8   r9   r:   r   ry   objectr   r}   r   r   rr   rd   r   r   r   r   r?   r@   rA   rv   rv      s        ;2 ; ; ; ;K K4J K K K K?F ? ? ? ?cDj)A    C9 C C C C) ) 	)
 *D0) 
) ) ) ) ) )r@   rv   c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	Cohere2VisionDummyInputsBuilder	mm_countsr_   c                 x    |                     dd          }| j                                        }|j        }||z  S )Nr   r   )getinfor}   image_token)rV   r   
num_imagesr   r   s        rA   get_dummy_textz.Cohere2VisionDummyInputsBuilder.get_dummy_text   s;    ]]7A..
I..00	+Z''r@   Nseq_len
mm_optionsc                     |                     dd          }| j                                        }|r|                     d          nd }d|                     |j        |j        ||          iS )Nr   r   )rk   rj   r   	overrides)r   r   r   _get_dummy_imagesrk   rj   )rV   r   r   r   r   
image_sizeimage_overridess          rA   get_dummy_mm_dataz1Cohere2VisionDummyInputsBuilder.get_dummy_mm_data   s}     ]]7A..
Y@@BB
5?I*..111T T++ &!(%)	 ,  
 	
r@   rZ   )
r8   r9   r:   r   rr   rd   r   r   r   r   r?   r@   rA   r   r      s        (S(9 (c ( ( ( ( =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
r@   r   c            
            e Zd Zdedeeef         deeef         deeef         def
 fdZdedeeef         deeef         fd	Z	d
e
deeef         dedee         fdZ xZS ) Cohere2VisionMultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsr_   c                    	 t                                          ||||          }d|vr|                    d          x}  j        j        di |                                                     d|i                              dt                    		 fdt          t          	                    D             }t          j        |          |d<   |S )Nr7   imagesr   c                     g | ]O}j                                                 |          j                            |          j                   PS )r   r   r   )r   r   get_image_sizerk   rj   ).0ihf_processorparsed_imagesrV   s     rA   
<listcomp>zGCohere2VisionMultiModalProcessor._call_hf_processor.<locals>.<listcomp>  sm         	)) - < <Q ? ? E!.!=!=a!@!@!G* *    r@   r?   )rK   _call_hf_processorr   r   r}   _get_data_parserparse_mm_data	get_itemsr   rangelenr=   tensor)rV   r   r   r   r   processed_outputsr   r7   r   r   rX   s   `       @@rA   r   z3Cohere2VisionMultiModalProcessor._call_hf_processor   s    "GG66	
 
 !222";;x000=5495BB	BBL %%''0117$788       s=1122  K 05|K/H/Hm,  r@   	hf_inputshf_processor_mm_kwargsc                     |                     dt          j        d                    }t          t	          j        d|          t	          j        d          t	          j        d                    S )Nr7   r   r   )r0   r7   image_embeds)r   r=   emptydictr   flat_from_sizesbatched)rV   r   r   r7   s       rA   _get_mm_fields_configz6Cohere2VisionMultiModalProcessor._get_mm_fields_config  sb    
  mmM5;q>>BB.>wTT-5g>>.6w??
 
 
 	
r@   mm_itemsout_mm_kwargsc           	          	
   j         j        di |j        t          j        dz            
j        	j        j        dt          f	
 fd}t          d|          gS )NrH   item_idxc                                         dt                    }|                    |           }j                            |j        |j                  }	z  
z   } ||z    }t          j        |	          S )Nr   r   )	r   r   r   r   r   rk   rj   r!   select_text)r   r   r   r7   patch_tokensrepl	boi_token	eoi_tokenr   r   img_line_break_tokenimg_tokens_per_tiler   rV   s         rA   get_replacementzMCohere2VisionMultiModalProcessor._get_prompt_updates.<locals>.get_replacement4  s    ''1DEEF$*$9$9($C$CJ)33&,'.& 4  K
 ')<<?SSLH!;HYHHD&24EEEr@   r   )modalitytargetreplacementr?   )	r   r}   r   rd   r   r   r   r   r   )rV   r   r   r   r   r   r   r   r   r   r   s   ``   @@@@@@rA   _get_prompt_updatesz4Cohere2VisionMultiModalProcessor._get_prompt_updates'  s     2ty1KK4JKK".!,"91"<==+@ *	 *		Fc 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F  "+  
 	
r@   )r8   r9   r:   rr   r   r   r	   r   r   r   r   r   r   r    r   rs   rt   s   @rA   r   r      s       &!&! f%&! 3;'	&!
 CK(&! 
&! &! &! &! &! &!P



 !(V 4

 
++	,	

 

 

 

!
%!
 !(V 4!
 -	!

 
,	!
 !
 !
 !
 !
 !
 !
 !
r@   r   )r   dummy_inputsc                       e Zd Z eddddd          Zddd	ed
ef fdZed             Z	de
eeej        f                  dee         fdZdedeej                 fdZdededz  fdZdedefdZdedefdZ	 	 d dej        dej        dedz  dej        dz  dedej        ez  fdZdej        dej        dz  fdZ xZS )!%Cohere2VisionForConditionalGenerationzvision_tower.zmulti_modal_projector.zlanguage_model.model.zlanguage_model.lm_head.)zmodel.vision_tower.zmodel.multi_modal_projector.zmodel.language_model.zlm_head.)orig_to_new_prefixrD   rF   vllm_configrF   c          	         t                                                       |j        j        }|j        }|j        j        }|| _        || _        || _        |                     ||           |                     |d          5  t          |j
        |t          |d                    | _        t          |t          |d                    | _        d d d            n# 1 swxY w Y   |                     |          5  t!          ||j        t          |d          |j        j                  | _        d d d            d S # 1 swxY w Y   d S )Nr   vision_towerr   multi_modal_projectorlanguage_model)r   	hf_configrF   architectures)rK   rL   model_configr   quant_configmultimodal_configrE   _patch_quant_config_mark_tower_modelr)   rN   r-   r   rC   r   _mark_language_modelr,   rT   r   r   )rV   r   rF   rE   r   r   rX   s         rA   rL   z.Cohere2VisionForConditionalGeneration.__init__Z  s   &1&>&H"/'4F(!2  666##K99 	 	 1$#FN;;! ! !D
 *J|F4KLL* * *D&	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 &&{33 	 	"<' ,#F,<==$0>	# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s%   ACC!$C!=6E  EEc                 N    t          |                                           j        S rZ   )next
parametersdtyperz   s    rA   r   z+Cohere2VisionForConditionalGeneration.dtypev  s    DOO%%&&,,r@   weightsr_   c                 X    t          |           }|                    || j                  S )N)mapper)r*   load_weightshf_to_vllm_mapper)rV   r   loaders      rA   r   z2Cohere2VisionForConditionalGeneration.load_weightsz  s+    "4((""743I"JJJr@   image_inputc                     |d         }|d         }|                      |          }|                     |          }d |                    |                                          D             S )a  Process image pixels through vision tower and projector.

        Args:
            image_input: Validated image input containing pixel values and
                         patch counts

        Returns:
            List of flattened image embeddings, one per image
        r0   r7   c                 :    g | ]}|                     d d          S )r   rH   )flatten)r   es     rA   r   zNCohere2VisionForConditionalGeneration._process_image_input.<locals>.<listcomp>  s$    RRRA		!QRRRr@   )r   r   splittolist)rV   r   r{   r0   r7   r\   r   s          rA   _process_image_inputz:Cohere2VisionForConditionalGeneration._process_image_input~  ss     #>2!-0 **<88 11.AA SR););K<N<N<P<P)Q)QRRRRr@   r{   Nc                    |                     dd           }|                     dd           }|                     dd           }|
J d            |d S t          d||| j        j        j        | j        j        j        d          S )Nr0   r7   r   z,Cohere2Vision does not support image_embeds.)r4   r5   )r1   r0   r7   resolve_bindings)popr/   rE   rN   r   )rV   r{   r0   r7   r   s        rA   _parse_and_validate_image_inputzECohere2VisionForConditionalGeneration._parse_and_validate_image_input  s     zz.$77jj55zz.$77##%S###4,%#[.9[.9 	
 
 
 	
r@   rE   r   c                     t          |t                    r=|j        }t          |dd           }|j        s | |j                            d           d S d S d S d S )Nquantization_configr   )
isinstancer   rT   getattrmodules_to_not_convertappend)rV   rE   r   rT   llm_quant_configs        rA   r   z9Cohere2VisionForConditionalGeneration._patch_quant_config  s    
 lI.. 	K ,K&{4I4PP 7 K ,3::>JJJJJ	K 	KK K,,r@   c                 @     | j         di |}|g S  | j        |fi |S r   )r   r   )rV   r{   r   s      rA   embed_multimodalz6Cohere2VisionForConditionalGeneration.embed_multimodal  s@    :d:DDVDDI(t(?????r@   	input_ids	positionsintermediate_tensorsinputs_embedsc                 J    |d }| j                             ||||          }|S )N)r  r	  r
  r  )r   model)rV   r  r	  r
  r  r{   r]   s          rA   r^   z-Cohere2VisionForConditionalGeneration.forward  s@      + M+11!5'	 2 
 
 r@   r]   c                 6    | j                             |          S rZ   )r   compute_logits)rV   r]   s     rA   r  z4Cohere2VisionForConditionalGeneration.compute_logits  s     "11-@@@r@   )NN)r8   r9   r:   r+   r   r   rr   rL   propertyr   r   tupler=   r>   setr   r/   listr   r   r   r
   r   r   r&   r  r"   r^   r  rs   rt   s   @rA   r   r   K  s=        &#2,D%<1	
 
   BD   z 3      8 - - X-KHU33D-E$F K3s8 K K K KS8S	el	S S S S0

	&	-
 
 
 
*K&K6HK K K K@ @4H @ @ @ @ <@-1 < < 2D8	
 |d*  
+	+   &A|A 
	A A A A A A A Ar@   r   )Ir;   collections.abcr   r   r   typingr   r   r=   r   transformersr	   r
   "transformers.models.cohere2_visionr   Gtransformers.models.cohere2_vision.image_processing_cohere2_vision_fastr   <transformers.models.cohere2_vision.processing_cohere2_visionr   vllm.configr   vllm.config.multimodalr   %vllm.model_executor.layers.activationr   !vllm.model_executor.layers.linearr   r   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.quantization.awqr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   r   vllm.multimodal.processingr   r   r   r   r    r!   vllm.sequencer"   vllm.utils.tensor_schemar#   r$   
interfacesr&   r'   r(   siglipr)   utilsr*   r+   r,   r-   r/   ModulerC   rv   r   r   register_processorr   r?   r@   rA   <module>r+     s   Q P 7 7 7 7 7 7 7 7 7 7 % % % % % % % %        7 7 7 7 7 7 7 7 B B B B B B           # " " " " " 3 3 3 3 3 3 < < < < < <        G F F F F F A A A A A A / / / / / /         
 V U U U U U U U U U                . - - - - - > > > > > > > > L L L L L L L L L L % % % % % %               L   0< < < < <ry < < <~= = = = ="4 = = =@
 
 
 
 
67
 
 
@X
 X
 X
 X
 X
78X
 X
 X
v ('$	$0  
FA FA FA FA FABI7I: FA FA 
FA FA FAr@   