
    .`iQ              	          U d Z ddlmZmZmZ ddlmZmZmZ ddl	Z	ddl
mZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2 ddl3m4Z4m5Z5m6Z6m7Z7m8Z8  ee9          Z: G d de+          Z;e;Z<ee=d<    G d dej>                  Z? G d dej>                  Z@ G d de$          ZA G d d e"eA                   ZB G d! d"e#eA                   ZC ejD        eCeAeB#           G d$ d%ej>        e/e.e0                      ZEdS )&zInference-only BAGEL model compatible with HuggingFace weights.

BAGEL is a unified multimodal model for image understanding and generation.
For vLLM, we focus on the image understanding (vision-to-text) capabilities.
    )IterableMappingSequence)AnyLiteral	TypeAliasN)
VllmConfig)BaseDummyOptions)init_logger)
get_act_fn)ColumnParallelLinearRowParallelLinear)QuantizationConfig)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)MultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacement)IntermediateTensors)BagelProcessor)TensorSchema   )MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPP)SiglipVisionModel)AutoWeightsLoaderStageMissingLayerWeightsMapperinit_vllm_registered_modelmaybe_prefixc                   >    e Zd ZU dZed         ed<   ej        ed<   dS )BagelImagePixelInputsz
    Dimensions:
        - bn: Batch size * number of images
        - c: Number of channels (3)
        - h: Height of each image
        - w: Width of each image
    pixel_valuestypeN)__name__
__module____qualname____doc__r   __annotations__torchTensor     t/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/bagel.pyr(   r(   <   s=           .
!!!!,r3   r(   BagelImageInputsc                   n     e Zd ZdZ	 	 	 ddedededed	edz  d
ef fdZdej	        dej	        fdZ
 xZS )BagelVisionMLPz"MLP connector for vision features.gelu_pytorch_tanhN in_featureshidden_featuresout_features	act_layerquant_configprefixc                     t                                                       t          ||d|| d          | _        t	          |          | _        t          ||d|| d          | _        d S )NTz.fc1)biasr>   r?   z.fc2)super__init__r   fc1r   actr   fc2)selfr:   r;   r<   r=   r>   r?   	__class__s          r4   rC   zBagelVisionMLP.__init__O   s     	'%???
 
 
 i(($%???
 
 
r3   xreturnc                     |                      |          \  }}|                     |          }|                     |          \  }}|S N)rD   rE   rF   )rG   rI   _s      r4   forwardzBagelVisionMLP.forwardi   s<    xx{{1HHQKKxx{{1r3   )r8   Nr9   )r+   r,   r-   r.   intstrr   rC   r0   r1   rN   __classcell__rH   s   @r4   r7   r7   L   s        ,, -26
 

 
 	

 
 )4/
 
 
 
 
 
 
4 %,        r3   r7   c                        e Zd ZdZdedef fdZededefd            Zedefd            Zedefd	            Z	d
e
j        de
j        fdZ xZS )PositionEmbeddingzA2D position embedding for vision tokens using sin-cos embeddings.max_num_patch_per_sidehidden_sizec                    t                                                       || _        || _        |                     ||          }|                     dt          j        |                                          d           d S )N	pos_embedF)
persistent)	rB   rC   rU   rV   _get_2d_sincos_pos_embedregister_bufferr0   
from_numpyfloat)rG   rU   rV   rX   rH   s       r4   rC   zPositionEmbedding.__init__s   s    &<#& 11+?UVV	Y''--// 	 	
 	
 	
 	
 	
r3   	embed_dim	grid_sizec                 @   ddl }|                    ||j                  }|                    ||j                  }|                    ||          }|                    |d          }|                    dd||g          }t                              | |          }|S )z(Generate 2D sin-cos position embeddings.r   Ndtypeaxis   r   )numpyarangefloat32meshgridstackreshaperT   "_get_2d_sincos_pos_embed_from_grid)r^   r_   npgrid_hgrid_wgridrX   s          r4   rZ   z*PositionEmbedding._get_2d_sincos_pos_embed   s     	9BJ779BJ77{{66**xx1x%%||Q9i899%HHt
 
	 r3   c                     ddl }| dz  dk    sJ t                              | dz  |d                   }t                              | dz  |d                   }|                    ||gd          }|S )z2Generate 2D sin-cos position embeddings from grid.r   Nre   r   rc   )rf   rT   "_get_1d_sincos_pos_embed_from_gridconcatenate)r^   rp   rm   emb_hemb_wembs         r4   rl   z4PositionEmbedding._get_2d_sincos_pos_embed_from_grid   s     	1}!!!!!DDNDG
 
 "DDNDG
 
 nneU^!n44
r3   c                 `   ddl }| dz  dk    sJ |                    | dz  |j                  }|| dz  z  }dd|z  z  }|                    d          }|                    d	||          }|                    |          }|                    |          }|                    ||gd
          }|S )z(Generate 1D sin-cos position embeddings.r   Nre   ra   g       @g      ?i'  zm,d->mdr   rc   )rf   rg   float64rk   einsumsincosrs   )r^   posrm   omegaoutemb_sinemb_cosrv   s           r4   rr   z4PositionEmbedding._get_1d_sincos_pos_embed_from_grid   s     	1}!!!!		)q.
	;;S eUl"kk"ooii	3..&&++&&++nngw/an88
r3   position_idsrJ   c                 Z    |                     | j        j                  }| j        |         S )z
        Args:
            position_ids: Flattened position IDs, shape (N,) where each ID
                         corresponds to a position in the flattened grid
        Returns:
            Position embeddings of shape (N, hidden_size)
        )torX   device)rG   r   s     r4   rN   zPositionEmbedding.forward   s'     $t~'<==~l++r3   )r+   r,   r-   r.   rO   rC   staticmethodrZ   rl   rr   r0   r1   rN   rQ   rR   s   @r4   rT   rT   p   s        KK
s 
 
 
 
 
 
 
 C C    \ c    \ c    \"
,EL 
,U\ 
, 
, 
, 
, 
, 
, 
, 
,r3   rT   c                       e Zd ZdZdedefdZdeee	dz  f         fdZ
de	deee	f         deee	f         fd	Zd
e	de	de	fdZdS )BagelProcessingInfoz'Processing information for BAGEL model.kwargsrJ   c                     ddl m}  || j        j        j        | j        j        j        | j        j        j                  }|                                 }t          d||d|S )Nr   )cached_get_image_processor)revisiontrust_remote_code)image_processor	tokenizerr2   )	!vllm.transformers_utils.processorr   ctxmodel_configmodelr   r   get_tokenizerr   )rG   r   r   r   r   s        r4   get_hf_processorz$BagelProcessingInfo.get_hf_processor   s    PPPPPP44H!'X*3"h3E
 
 
 &&((	 
+
 
 
 
 	
r3   Nc                 
    dd iS )Nimager2   )rG   s    r4   get_supported_mm_limitsz+BagelProcessingInfo.get_supported_mm_limits   s    r3   seq_len	mm_countsc                 F    |                                  }|j        dz  }d|iS )Nre   r   )get_hf_configvit_max_num_patch_per_side)rG   r   r   	hf_configmax_num_patchess        r4   get_mm_max_tokens_per_itemz.BagelProcessingInfo.get_mm_max_tokens_per_item   s/    
 &&((	 $>A))r3   image_widthimage_heightc                d    |                                  }|j        }|j        }||z  }||z  }||z  S rL   )r   
vit_config
patch_size)rG   r   r   r   r   r   num_patches_hnum_patches_ws           r4   get_num_image_tokensz(BagelProcessingInfo.get_num_image_tokens   sF     &&((	)
*
 %
2#z1},,r3   )r+   r,   r-   r.   objectr   r   r   rP   rO   r   r   r   r2   r3   r4   r   r      s        11
 
N 
 
 
 
"cDj)A    	*	* 38$	* 
c			* 	* 	* 	*- - 	-
 
- - - - - -r3   r   c            	       t    e Zd ZdZdeeef         defdZ	 d	dedeeef         deeef         dz  de	fdZ
dS )
BagelDummyInputsBuilderz-Build dummy inputs for BAGEL model profiling.r   rJ   c                 8    |                     dd          }d|z  S )Nr   r   <|image_pad|>)get)rG   r   
num_imagess      r4   get_dummy_textz&BagelDummyInputsBuilder.get_dummy_text   s     ]]7A..
++r3   Nr   
mm_optionsc                     |                     dd          }| j                                        }|j        }|j        }|r|                     d          nd }d|                     ||||          iS )Nr   r   )widthheightr   	overrides)r   infor   r   
image_size_get_dummy_images)	rG   r   r   r   r   r   r   r   image_overridess	            r4   get_dummy_mm_dataz)BagelDummyInputsBuilder.get_dummy_mm_data   s     ]]7A..
I++--	)
  *
5?I*..111T T++ !%)	 ,  
 	
r3   rL   )r+   r,   r-   r.   r   rP   rO   r   r
   r   r   r2   r3   r4   r   r      s        77,S(9 ,c , , , , =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
r3   r   c            
           e Zd ZdZdededeeef         deeef         def
dZ	dedeee
f         dedee         fd	Zd
e
deeef         deeef         fdZdS )BagelMultiModalProcessorz%Multimodal processor for BAGEL model.prompt_textmm_itemshf_processor_mm_kwargstokenization_kwargsrJ   c                     dS )NFr2   )rG   r   r   r   r   s        r4   _hf_processor_applies_updatesz6BagelMultiModalProcessor._hf_processor_applies_updates  s	     ur3   out_mm_kwargsc                    | j                                         | j                                         }|                                                    d          t          d          dt          ffd}t          dg|          gS )z=Replace image placeholders with the correct number of tokens.r   Nz=Image token '<|image_pad|>' not found in tokenizer vocabularyitem_idxc                 $    j         dz  }g|z  S )Nre   )r   )r   
num_tokensr   image_token_ids     r4   get_replacement_bagelzKBagelMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_bagel-  s    "=q@J"#j00r3   r   )modalitytargetreplacement)r   r   r   	get_vocabr   
ValueErrorrO   r   )rG   r   r   r   r   r   r   r   s         @@r4   _get_prompt_updatesz,BagelMultiModalProcessor._get_prompt_updates  s     I++--	 I++--	",,..22?CC!O  	1C 	1 	1 	1 	1 	1 	1 	1  &'1  
 	
r3   	hf_inputsc                 .    dt          j        d          iS )Nr)   r   )r   batched)rG   r   r   s      r4   _get_mm_fields_configz.BagelMultiModalProcessor._get_mm_fields_config;  s     19'BB
 	
r3   N)r+   r,   r-   r.   rP   r   r   r   boolr   r   r   r   r   r   r   r   r2   r3   r4   r   r     s        // & !(V 4	
 %S&[1 
   
%
 !(S 1
 -	

 
#	$
 
 
 
>

 !(V 4
 
++	,	
 
 
 
 
 
r3   r   )r   dummy_inputsc                       e Zd ZdZ eddddd          Zeded	ed
edz  fd            Z	ddde
def fdZded
edz  fdZded
eej        df         fdZded
efdZ	 	 d!dej        dej        dedz  dej        dz  ded
ej        ez  fdZdej        d
ej        dz  fdZdeeeej        f                  d
ee         fd Z xZS )"BagelForConditionalGenerationz
    BAGEL: A unified multimodal model for image understanding and generation.

    For vLLM, we focus on the image understanding (vision-to-text) capabilities.
    The image generation part is not supported in vLLM.
    language_model.
vit_model.
connector.vit_pos_embed.)r   r   r   r   )orig_to_new_prefixr   irJ   Nc                 N    |                     d          rdS t          d          )Nr   r   z Only image modality is supported)
startswithr   )clsr   r   s      r4   get_placeholder_strz1BagelForConditionalGeneration.get_placeholder_str^  s,    w'' 	#"?;<<<r3   r9   )r?   vllm_configr?   c                   t                                                       |j        j        }|j        }|j        j        }t          |          j        dk    r%t          dt          |          j         d          || _	        || _        | 
                    |          5  t          ||j        t          |d          dg          | _        d d d            n# 1 swxY w Y   |j        r|j        }|j        dk    r!t$                              d           d	|_        t)          |d
          s!t$                              d           d|_        |                     |d          5  t/          ||t          |d                    | _        |j        j        }|j        j        }t5          ||||j        |t          |d                    | _        t;          |j        |          | _        d d d            n# 1 swxY w Y   n<tA          d          | _        tA          d          | _        tA          d          | _        | j        j!        | _!        d S )NBagelConfigzExpected BagelConfig, got z0. Make sure the model config is properly loaded.language_modelQwen2ForCausalLM)r   r   r?   architectures   zZOverriding vit_config.num_hidden_layers from 27 to 26 to match the Bagel model checkpoint.   vision_use_headz_Setting vit_config.vision_use_head to False as it is not present in the Bagel model checkpoint.Fr   	vit_model)configr>   r?   	connector)r:   r;   r<   r=   r>   r?   )rU   rV   image_tower)"rB   rC   r   r   r>   multimodal_configr*   r+   r   r   _mark_language_modelr%   
llm_configr&   r   
visual_undr   num_hidden_layersloggerwarninghasattrr   _mark_tower_modelr!   r   rV   r7   connector_actr   rT   r   vit_pos_embedr#   make_empty_intermediate_tensors)
rG   r   r?   r   r>   r   r   vit_hidden_sizellm_hidden_sizerH   s
            r4   rC   z&BagelForConditionalGeneration.__init__e  s   )3"/'4F << M11AT&\\-B A A A  
 !2 &&{33 	 	"<' +#F,<==12	# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	  -	B  *J+r11;   02
,:'899 3=   .3
*''W==  !2%!-'<<" " " #)"3"?"("3"?!/ /$3!0$2!-'<<" " " &7+1+L /& & &"+              4 /}==DN.}==DN!2=!A!AD ? 	,,,s%   !-CC!C'BG99G= G=r   c                 X    |                     dd           }|d S t          d|          S )Nr)   )r*   r)   )popr(   )rG   r   r)   s      r4   _parse_and_validate_image_inputz=BagelForConditionalGeneration._parse_and_validate_image_input  s?     zz.$774$%
 
 
 	
r3   image_input.c                 "   |d         }|j         dk    r(|j        \  }}}}}|                    ||z  |||          }|                     |          }|                     |          }	|	j        \  }}
}| j        j        j        }| j        j        j        }||z  }t          j
        ||	j                  }t          j
        ||	j                  }|dddf         | j        j        z  |z                                   }|                    d                              |d                                          }|                     |          }|                    ||
|          }|                    |	j                  }|	|z   }	t%          |	          S )z:Process image inputs through vision encoder and connector.r)      )r   Nr   rx   )ndimshaperk   r   r   r   r   r   r   r0   rg   r   r   flatten	unsqueezeexpandr   r   tuple)rG   r   r)   
batch_sizer   channelsr   r   vision_featuresvision_embedsnum_patchesrV   r   r   num_patches_per_sideh_coordsw_coordsr   
pos_embedss                      r4   _process_image_inputz2BagelForConditionalGeneration._process_image_input  s    #>2
 !!>J>P;J
Hfe'//Z'65 L ..66 77 0=/B,
K[+6
[+6
  *Z7 < 4]=QRRR< 4]=QRRRQQQW FFQ
')) 	 $--a0077
BGGOOQQ ''55
''
KMM
]]=#788
%
2 ]###r3   c                 N     | j         di |}|g S |                     |          S )z%Get multimodal embeddings from input.Nr2   )r   r  )rG   r   r   s      r4   embed_multimodalz.BagelForConditionalGeneration.embed_multimodal  s9    :d:DDVDDI((555r3   	input_ids	positionsintermediate_tensorsinputs_embedsc                 J    |d}| j                             ||||          }|S )an  Run forward pass for BAGEL.

        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a batch.
            positions: Flattened (concatenated) position ids corresponding to a batch.
            intermediate_tensors: Intermediate tensors from prior forward pass.
            inputs_embeds: Optional tensor of input embeddings.
        N)r  r  r  r  )r   r   )rG   r  r  r  r  r   hidden_statess          r4   rN   z%BagelForConditionalGeneration.forward  s@       + M+11!5'	 2 
 
 r3   r  c                 6    | j                             |          S rL   )r   compute_logits)rG   r  s     r4   r  z,BagelForConditionalGeneration.compute_logits  s     "11-@@@r3   weightsc                 T   g d}ddg}g }|D ]\  }t          fd|D                       r!t          fd|D                       r=dv r|j        dk    r|j        d         }|j        d	         }| j        j        j        }| j        j        j        }	||	|z  |z  k    rB|                    ||||	          }|                    dd
d	d          	                                }|
                    |f           t          | dg          }
|
                    || j                  S )zLoad weights from checkpoint.)moe_genlatent_pos_embedllm2vaevae2llmtime_embedderzdecoder.zencoder.c              3       K   | ]}|v V  	d S rL   r2   ).0skipnames     r4   	<genexpr>z=BagelForConditionalGeneration.load_weights.<locals>.<genexpr>4  s'      @@D44<@@@@@@r3   c              3   B   K   | ]}                     |          V  d S rL   )r   )r%  r?   r'  s     r4   r(  z=BagelForConditionalGeneration.load_weights.<locals>.<genexpr>6  s/      FFv4??6**FFFFFFr3   zpatch_embedding.weightre   r   r      zvit_pos_embed.pos_embed)skip_prefixes)mapper)anyr  r  r   r   r   num_channelsrk   permute
contiguousappendr"   load_weightshf_to_vllm_mapper)rG   r  generation_keywordsvae_prefixesfiltered_weightstensorout_channelsr:   r   in_channelsloaderr'  s              @r4   r2  z*BagelForConditionalGeneration.load_weights  s}   
 
 
 
 # 	4 	4LD&@@@@,?@@@@@ FFFFFFFFF '4//FK14D4D%|A$l1o![3>
"k4A+
":Z"GGG#^^$j*k F $^^Aq!Q77BBDDF##T6N3333 #48Q7RSSS""#3D<R"SSSr3   )NN)r+   r,   r-   r.   r$   r3  classmethodrP   rO   r   r	   rC   r   r5   r   r  r0   r1   r  r   r  r   rN   r  r   setr2  rQ   rR   s   @r4   r   r   E  s@         &0&&.	
 
   =3 =3 =3: = = = [= BD N
 N
 N
z N
3 N
 N
 N
 N
 N
 N
`

	D	 
 
 
 
0$+0$	u|S 	!0$ 0$ 0$ 0$d6 64H 6 6 6 6 <@-1 < < 2D8	
 |d*  
+	+   6A|A 
	A A A A+THU33D-E$F +T3s8 +T +T +T +T +T +T +T +Tr3   r   )Fr.   collections.abcr   r   r   typingr   r   r   r0   torch.nnnnvllm.configr	   vllm.config.multimodalr
   vllm.loggerr   %vllm.model_executor.layers.activationr   !vllm.model_executor.layers.linearr   r   'vllm.model_executor.layers.quantizationr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   vllm.multimodal.processingr   r   r   r   vllm.sequencer   (vllm.transformers_utils.processors.bagelr   vllm.utils.tensor_schemar   
interfacesr   r   r   r    siglipr!   utilsr"   r#   r$   r%   r&   r+   r   r(   r5   r/   Moduler7   rT   r   r   r   register_processorr   r2   r3   r4   <module>rS     s     8 7 7 7 7 7 7 7 7 7 * * * * * * * * * *        " " " " " " 3 3 3 3 3 3 # # # # # # < < < < < <        G F F F F F / / / / / /         
 6 5 5 5 5 5            . - - - - - C C C C C C 1 1 1 1 1 1            & % % % % %              
X		
 
 
 
 
L 
 
 
 4 ) 3 3 3! ! ! ! !RY ! ! !HK, K, K, K, K,	 K, K, K,\/- /- /- /- /-, /- /- /-d
 
 
 
 
45HI 
 
 
@2
 2
 2
 2
 2
67JK 2
 2
 2
j ('	(  
~T ~T ~T ~T ~TI!<~T ~T 
~T ~T ~Tr3   