
    .`iq              
          d dl Z d dlZd dlmZmZmZ d dlmZmZ d dl	Z	d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZ d d	lmZ d d
lmZ d dlmZ d dlmZm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z&m'Z'm(Z( d dl)m*Z*m+Z+m,Z, d dl-m.Z.m/Z/m0Z0m1Z1m2Z2 d dl3m4Z4 d dl5m6Z6m7Z7 ddl8m9Z9m:Z:m;Z;m<Z<m=Z= ddl>m?Z? ddl@mAZAmBZBmCZCmDZD  G d de6          ZEeEZF G d de0          ZG G d de.eG                   ZH G d de/eG                   ZI G d  d!ejJ                  ZK e$jL        eIeGeH"           G d# d$ejJ        e<e;e=e9                      ZMdS )%    N)IterableMappingSequence)	AnnotatedLiteral)BatchFeature)ACT2FN)Lfm2VlProcessor)Lfm2VlConfig)Lfm2VlImageProcessorFastfind_closest_aspect_ratioround_by_factor)
VllmConfig)BaseDummyOptions)set_forward_context)MambaStateDtypeCalculatorMambaStateShapeCalculator)MultiModelKeys)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)ImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdateDetails)IntermediateTensors)TensorSchemaTensorShape   )IsHybridMultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPP)Siglip2Model)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefixc                       e Zd ZU dZdZed         ed<   eej	         e
ddd          f         ed<   eej	         e
dd          f         ed<   eej	         e
d	          f         ed
<   dS )Lfm2VLImagePixelInputsz
    Dimensions:
        - b: Number of images in the prompt
        - bn: Batch size * number of images
        - d: Number of dimensions
        - fd: Number of features per dimension
    pixel_valuestypebndfd   spatial_shapesbnum_patchesN)__name__
__module____qualname____doc__r2   r   __annotations__r   torchTensorr#        v/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/lfm2_vl.pyr0   r0   >   s           %3D'.
!222EL++dC*F*FFGGGGelKKa,@,@@AAAA5<S)9)99::::::rB   r0   c                      e Zd Zd Zd ZdedefdZdee	e
dz  f         fdZdefdZd	e
d
e
de
de
de
dedefdZd	e
d
e
de
de
de
de
dee
e
f         fdZde
de
deee
e
f                  fdZd	e
d
e
de
de
de
dee
e
f         fdZde
de
dedz  dee
e
f         fdZde
de
dedz  de
fdZde
de
dej        dedz  de	f
dZdej        dedz  dee
e
f         fdZdS )Lfm2VLProcessingInfoc                 @    | j                             t                    S N)ctxget_hf_configr   selfs    rC   rI   z"Lfm2VLProcessingInfo.get_hf_configQ   s    x%%l333rB   c                 2     | j         j        t          fi |S rG   )rH   get_hf_processorr
   rK   kwargss     rC   rM   z%Lfm2VLProcessingInfo.get_hf_processorT   s    (tx(CCFCCCrB   rO   returnc                 &     | j         di |j        S NrA   )rM   image_processorrN   s     rC   get_image_processorz(Lfm2VLProcessingInfo.get_image_processorW   s    $t$..v..>>rB   Nc                 
    dd iS )NimagerA   rJ   s    rC   get_supported_mm_limitsz,Lfm2VLProcessingInfo.get_supported_mm_limitsZ   s    rB   c                     |                                  }|j        }|j        }|j        }||dz  z  |dz  z  }t	          t          j        |                    }t          ||          S )Nr6   )widthheight)rT   max_image_tokensencoder_patch_sizedownsample_factorintmathsqrtr   )rK   	processorr[   r\   r]   
max_pixelssides          rC   !get_image_size_with_most_featuresz6Lfm2VLProcessingInfo.get_image_size_with_most_features]   st    ,,..	$5&9%7%);Q)>?CTVWCWX
49Z(())tD1111rB   rZ   rY   r[   r\   r]   max_pixels_tolerancec                     ||z  }t          |t          ||                    }t          |t          ||                    }	||	z  ||dz  z  |dz  z  |z  k    S )z<Check if the image is too large to be processed as one tile.r6   )maxr   )
rK   rZ   rY   r[   r\   r]   re   total_factorh_barw_bars
             rC   _is_image_too_largez(Lfm2VLProcessingInfo._is_image_too_largef   s{     *,==&(M(MNN&|(L(LMMEM !#$"# ###	
rB   min_image_tokensc                 r   ||z  }||dz  z  |dz  z  }||dz  z  |dz  z  }	t          |t          ||                    }
t          |t          ||                    }|
|z  |	k    rqt          j        ||z  |	z            }t          |t          j        ||z  |z            |z            }
t          |t          j        ||z  |z            |z            }n]|
|z  |k     rTt          j        |||z  z            }t          j        ||z  |z            |z  }
t          j        ||z  |z            |z  }||
fS )Nr6   )rg   r   r_   r`   floorceil)rK   rZ   rY   r]   rl   r[   r\   rh   smart_resize_min_pixelssmart_resize_max_pixelsri   rj   betas                rC   smart_resizez!Lfm2VLProcessingInfo.smart_resize|   su    *,==11447H!7KK 	  11447H!7KK 	  L/&,"G"GHHL/%"F"FGG5=2229fun0GGHHDdj$)EFFU E dj)DEET EE U]44494GHHDIftml:;;lJEIedl\9::\IEe|rB   	min_tiles	max_tilesc                     fdt          dz             D             }t          t          |          d           S )Nc                     g | ]E}t          d |d z             D ]/}t          d |d z             D ]}||z  cxk    rk    n n||f0FS )r$   )range).0nwhru   rt   s       rC   
<listcomp>z7Lfm2VLProcessingInfo._target_ratios.<locals>.<listcomp>   s     
 
 
1a!e__
 
 1a!e__	
 
 AE....Y.....	 F /....rB   r$   c                 $    | d         | d         z  S )Nr   r$   rA   )xs    rC   <lambda>z5Lfm2VLProcessingInfo._target_ratios.<locals>.<lambda>   s    1! rB   )key)rx   sortedset)rK   rt   ru   ratioss    `` rC   _target_ratiosz#Lfm2VLProcessingInfo._target_ratios   s^    
 
 
 
 
9i!m44
 
 
 c&kk'<'<====rB   	tile_sizec                 x    ||z  }|                      ||          }t          |||||          \  }}	||	z  }
||	|
fS rG   )r   r   )rK   rZ   rY   rt   ru   r   aspect_ratiotarget_ratios
grid_widthgrid_heighttotal_patchess              rC   _get_grid_layoutz%Lfm2VLProcessingInfo._get_grid_layout   sZ     v~++IyAA";-	#
 #

K #[0;55rB   image_widthimage_heightra   c                    ||                                  }|j        j        }|j        j        }|j        j        }|j        j        }|j        j        }|j        j        }	|j        j        }
||cxk    odk    nc  }| 	                    |||	|||          }|r!|r| 
                    |||||
          \  }}}ndx}x}}||z  dk    r|dz  }|||fS )Nr$   )rZ   rY   r[   r\   r]   re   )rt   ru   r   )rT   rS   r]   r\   re   rt   ru   r[   r   rk   r   )rK   r   r   ra   r]   r\   re   rt   ru   r[   r   do_image_splittingis_image_larger   r   r   s                   rC   _get_image_feature_grid_sizez1Lfm2VLProcessingInfo._get_image_feature_grid_size   s:    0022I%5G&6I(8M-7	-7	$4E-7	!*i!<!<!<!<1!<!<!<!<<11-1/!5 2 
 
  		90 		9595J5J### 6K 6 62J]] 898J8}#q((QM;55rB   c                >    |                      |||          \  }}}|S )Nr   r   ra   )r   )rK   r   r   ra   _r   s         rC   get_num_patchesz$Lfm2VLProcessingInfo.get_num_patches   s7     #??#% @ 
 
1m
 rB   r7   c                    ||                                  }d}|j        }|j        }|j        }|j        }	|                     ||          \  }
}|||z  z   |                     |||          \  }}dk    s|dk    r>fdt          |          D             }|
dk    r|                    |	||
z  z              n||
z  g}d	                    t          j        |g||g                    }|S )Nz<|img_row_{n_h}_col_{n_w}|>)r7   ra   r   r$   c                 n    g | ]1}t                    D ]}                    |d z   |d z              2S )r$   )n_hn_w)rx   format)ry   ijgrid_wtile_img_placeholders      rC   r}   z7Lfm2VLProcessingInfo.get_image_repl.<locals>.<listcomp>  sa     , , ,v, ,  %++A1q5+AA, , , ,rB   r    )rM   image_tokenimage_start_tokenimage_end_tokenimage_thumbnail_tokenget_num_image_tokensr   rx   appendjoin	itertoolschain)rK   r   r   r7   ra   grid_placeholderr   r   r   r   num_thumbnail_tokensnum_tokens_per_tilegrid_hr   tiles_placeholderplaceholderr   r   s                   @@rC   get_image_replz#Lfm2VLProcessingInfo.get_image_repl   sj    --//I8+%7#3 ) ?484M4M) 5N 5
 5
11  0;AT3TU ==#% > 
 
 A::!, , , , ,v, , , $a''!(()[;O-OP   "-/C!C DggO./1B_DUVV
 
 rB   c                    |j         j        }|j         j        }|j         j        }|d                                         |dz  z  }||z  }t          j        ||z            }||z  }	||	fS )Nr6   )rS   r   r]   r\   prodr_   ro   )
rK   r7   ra   r   r]   r\   r   num_patches_tiledwn_num_patches_tilenum_tiles_tokenss
             rC   r   z)Lfm2VLProcessingInfo.get_num_image_tokens#  s     -7	%5G&6I-b16688=NPQ=QR$(::#y)9<M)MNN/2FF#%555rB   )r:   r;   r<   rI   rM   objectr   rT   r   strr^   rW   r   rd   floatboolrk   tuplers   listr   r   r
   r   r   r?   r@   r   r   rA   rB   rC   rE   rE   P   s       4 4 4D D D?F ?7O ? ? ? ?cDj)A    29 2 2 2 2

 
 	

  
 
 $
 

 
 
 
,!! ! 	!
 ! !  ! 
sCx! ! ! !F> > >U3PS8_@U > > > >66 6 	6
 6 6 
sCx6 6 6 6"*6*6 *6 #T)	*6
 
sCx*6 *6 *6 *6X  	
 #T) 
   -- - 	-
 #T)- 
- - - -^6 6 #T)	6
 
sCx6 6 6 6 6 6rB   rE   c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	Lfm2VLDummyInputsBuilder	mm_countsrP   c                 x    |                     dd          }| j                                        }|j        }||z  S )NrV   r   )getinforM   r   )rK   r   
num_imagesra   r   s        rC   get_dummy_textz'Lfm2VLDummyInputsBuilder.get_dummy_text4  s;    ]]7A..
I..00	+Z''rB   Nseq_len
mm_optionsc                     |                     dd          }| j                                        \  }}|r|                     d          nd }d|                     ||||          iS )NrV   r   )rY   rZ   r   	overrides)r   r   rd   _get_dummy_images)rK   r   r   r   r   target_widthtarget_heightimage_overridess           rC   get_dummy_mm_dataz*Lfm2VLDummyInputsBuilder.get_dummy_mm_data:  s|     ]]7A..
&*i&Q&Q&S&S#m5?I*..111T T++"$%)	 ,  
 	
rB   rG   )
r:   r;   r<   r   r   r^   r   r   r   r   rA   rB   rC   r   r   3  s        (S(9 (c ( ( ( ( =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
rB   r   c            
            e Zd Zdedeeef         deeef         deeef         def
 fdZdedeeef         deeef         fd	Z	d
e
deeef         dedee         fdZ xZS )Lfm2VLMultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsrP   c                    
 |                     dg           x}sa j                                                            |          }                     |          }t          t          |g          d          S t                                          ||||          } 	                                
                    d|i                              dt                    fdt          t                              D             }  j        j        d	i |

 fd|D             }	t!          j        |	          |d<   |S )
Nimages)	input_idspt)tensor_typerV   c                 :    g | ]}                     |          S rA   )get_image_size)ry   r   parsed_imagess     rC   r}   z@Lfm2VLMultiModalProcessor._call_hf_processor.<locals>.<listcomp>j  s4     
 
 
01M((++
 
 
rB   c                 ^    g | ])}j                             |j        |j                   *S )r   )r   r   rY   rZ   )ry   sizehf_processorrK   s     rC   r}   z@Lfm2VLMultiModalProcessor._call_hf_processor.<locals>.<listcomp>o  sO     
 
 
  I%% J![& &  
 
 
rB   r9   rA   )r   r   get_tokenizerencode_apply_hf_processor_tokens_onlyr   dictsuper_call_hf_processor_get_data_parserparse_mm_data	get_itemsr   rx   lenrM   r?   tensor)rK   r   r   r   r   r   
prompt_idsprocessed_outputsimage_sizesr9   r   r   	__class__s   `         @@rC   r   z,Lfm2VLMultiModalProcessor._call_hf_processorQ  sv    "++h333 	P002299&AAJ==jIIJ
| < < <$OOOO!GG66	
 
 !!##]GV,--Yw 344 	

 
 
 
5:3};M;M5N5N
 
 
 2ty1>>I>>
 
 
 
 
 $
 
 
 ,1<+D+D-(  rB   	hf_inputshf_processor_mm_kwargsc                    |                     dt          j        d                    }t          t          t
          f         t          j        d|          t          j        d|d          t          j        dd                    S )Nr9   r   rV   T)keep_on_cpu)r1   r7   r9   )r   r?   emptyr   r   r   flat_from_sizesbatched)rK   r   r   r9   s       rC   _get_mm_fields_configz/Lfm2VLMultiModalProcessor._get_mm_fields_config{  s    
  mmM5;q>>BBC../.>wTT0@$   .5g4PPP
 
 
 	
rB   mm_itemsout_mm_kwargsc                        j         j        di |j        dt          f fd}t	          d|          gS )Nitem_idxc                 N                        dt                    }|                    |           }	d         |          }|d         j        }t	          |t
          j                  sJ 
j                            |j	        |j
        |          }t          j        |          S )NrV   r7   )r   r   r7   ra   )
embed_text)r   r   r   data
isinstancer?   r@   r   r   rY   rZ   r    select_text)r   r   
image_sizeout_itemr7   
image_replr   r   r   r   rK   s         rC   get_image_replacement_lfm2vlzSLfm2VLMultiModalProcessor._get_prompt_updates.<locals>.get_image_replacement_lfm2vl  s    ''1DEEF..x88J$W-h7H%&67<Nnel;;;;;11&,'.-&	 2  J '2&   rB   rV   )modalitytargetreplacementrA   )r   rM   r   r^   r   )rK   r   r   r   r  r   r   s   `` ` @@rC   _get_prompt_updatesz-Lfm2VLMultiModalProcessor._get_prompt_updates  s     2ty1KK4JKK".	3 	 	 	 	 	 	 	 	 	 	$  "8  
 	
rB   )r:   r;   r<   r   r   r   r   r   r   r   r   r   r   r   r  __classcell__r   s   @rC   r   r   P  s       (!(! f%(! 3;'	(!
 CK((! 
(! (! (! (! (! (!T

 !(V 4
 
++	,	
 
 
 
 
% 
 !(V 4 
 -	 

 
#	$ 
  
  
  
  
  
  
  
rB   r   c                   b     e Zd Z	 ddededef fdZdej        dej        d	ej        fd
Z	 xZ
S )Lfm2VLMultiModalProjectorFr   configuse_data_parallelprefixc                    t                                                       || _        |j        j        |j        dz  z  }|j        | _        |j        | _        | j        rt          j	        |          | _
        t          j        ||j        |j                  | _        t          |j                 | _        t          j        |j        |j        j        |j                  | _        d S )Nr6   )bias)r   __init__r  vision_confighidden_sizer]   factorprojector_use_layernormnn	LayerNorm
layer_normLinearprojector_hidden_sizeprojector_biaslinear_1r	   projector_hidden_actacttext_configlinear_2)rK   r  r  r  in_channelsr   s        rC   r  z"Lfm2VLMultiModalProjector.__init__  s     	!2*6&:RTU:UV.'-'E$' 	8 l;77DO	(&
 
 

 &56	(*&
 
 
rB   vision_features_packedr7   rP   c           
         |j         j        dk    s
J d            | j        }|j         }|j        d         }|                                }d |D             }g }d}	t          j        |t
          j                  }
t          j        |t
          j                  }t          j        |
|d          \  }}|	                    d          }|	                    d          }t          ||          D ]5\  \  }}}|dk    r||z  dk    s	||z  dk    rt          d	| d
| d| d          ||z  }||z  }t          j        |t
          j                  }t          j        |t
          j                  }t          j        ||d          \  }}|	                    d          }|	                    d          }|dddf         |z  |dddf         z   |z  |dddf         |z  |dddf         z   z   }|                    |	                    d          |	z              |	|z  }	7|r[t          j        |                              |          }|                    d|          }|	                    d||z  |z            }n|                    d||z  |z  f          }| j        r|                     |          }|                     |          }|                     |          }|                     |          }|S )aY  Project packed vision features without materializing padded tensors.

        Args:
            vision_features_packed: (total_tokens, hidden_size) packed in tile order.
            spatial_shapes: (num_tiles, 2) on CPU (height, width) per tile.

        Returns:
            projected_packed: (total_projected_tokens, text_hidden_size)
        cpuYExpected `spatial_shapes` on CPU to avoid device-to-host sync in variable-length packing.r   c                     g | ]
\  }}||z  S rA   rA   ry   r|   r{   s      rC   r}   z5Lfm2VLMultiModalProjector.forward.<locals>.<listcomp>       >>>$!QA>>>rB   r   dtypeij)indexing<spatial_shapes must be divisible by downsample_factor: got (, ) with factor=.N)device)r1  r2   r  shapetolistr?   arangeint64meshgridreshapezip
ValueErrorr   cattoindex_select	new_emptyr  r  r  r  r   )rK   r"  r7   r  r1  r  spatial_shapes_listlengths_listgather_idx_partsoffsetdhdwdh_griddw_griddh_flatdw_flatrZ   rY   length
height_out	width_outrows_outcols_outrrcc	token_idx
gather_idxgathered
unshuffledhidden_statesprojected_packeds                                  rC   forwardz!Lfm2VLMultiModalProjector.forward  sY    $)U222' 322 '.,226/=/D/D/F/F>>*=>>>/1\&444\&444 >"b4@@@//"%%//"%%'*+>'M'M 	 	#OVUV{{!##uv~':': E"E E&+E E;AE E E    6)JI|JekBBBH|IU[AAAH^HhFFFFBBBBBAAAtGv-aaa0@@EI111d7f$wtQQQw'77I ##I$5$5b$9$9F$BCCCfFF 	#34477v7FFJ-::1jIIH!))"fvo.KLLJJ/99FVOk12 J ' 	544Jj11//==77rB   )Fr   )r:   r;   r<   r   r   r   r  r?   r@   rU  r  r	  s   @rC   r  r    s        SU
 
"
7;
MP
 
 
 
 
 
.G  %G  G  
	G  G  G  G  G  G  G  G rB   r  )r   dummy_inputsc                   Z    e Zd ZdZ eddddd          Zeded	ed
edz  fd            Z	eddd
e
ej        df         fd            Zeddd
e
e
eef                  fd            Zdddedef fdZded
edz  fdZdej        dej        d
ej        fdZded
ej        eej                 z  fdZded
efdZ	 	 d(dej        dej        d edz  d!ej        dz  ded
ej        ez  fd"Zd#ej        d
ej        dz  fd$Zd%ee
eej        f                  d
ee         fd&Z d
e!fd'Z" xZ#S ))Lfm2VLForConditionalGenerationTzlanguage_model.lm_head.zlanguage_model.model.zvision_tower.zmulti_modal_projector.)zlm_head.zmodel.language_model.zmodel.vision_tower.zmodel.multi_modal_projector.)orig_to_new_prefixr  r   rP   Nc                 N    |                     d          rdS t          d          )NrV   z<image>z Only image modality is supported)
startswithr9  )clsr  r   s      rC   get_placeholder_strz2Lfm2VLForConditionalGeneration.get_placeholder_str"  s,    w'' 	9;<<<rB   vllm_configr   .c                 T    t          j        |j        j        |j        j                  S rG   )r   short_conv_state_dtypemodel_configr*  cache_configmamba_cache_dtype)r\  r^  s     rC   !get_mamba_state_dtype_from_configz@Lfm2VLForConditionalGeneration.get_mamba_state_dtype_from_config)  s+    
 )?$*$6
 
 	
rB   c                 ~    |j         }|j        j        j        }t	          j        |j        |j        |j                  S )zCalculate shapes for LFM2's convolutional cache.

        Args:
            vllm_config: vLLM config

        Returns:
            Tuple containing:
            - conv_state_shape: Shape for convolutional state cache
        )tp_world_sizeintermediate_sizeconv_kernel)	parallel_configra  	hf_configr  r   short_conv_state_shapetensor_parallel_sizer  conv_L_cache)r\  r^  ri  hf_language_configs       rC   !get_mamba_state_shape_from_configz@Lfm2VLForConditionalGeneration.get_mamba_state_shape_from_config3  sG     &5(5?K(?)>0<*7
 
 
 	
rB   model)r  r  c          	         t                                                       |j        j        }|j        j        }|j        }|j        }|| _        || _        || _        |j	        dk    | _
        |                     |d          5  |j        dk    r&t          ||t          |d                    | _        nt!          d|j                   t#          || j
        t          |d                    | _        d d d            n# 1 swxY w Y   |                     |          5  t)          ||j        t          |d	          |j        j        
          | _        d d d            n# 1 swxY w Y   | j        j        | _        d S )Nr   rV   siglip2_vision_modelvision_tower)r  quant_configr  z#Unsupported visual tokenizer type: multi_modal_projector)r  r  r  language)r^  rj  r  architectures)r   r  ra  rj  multimodal_configr  rt  r  r^  mm_encoder_tp_moder  _mark_tower_model
model_typer*   r.   rs  r9  r  ru  _mark_language_modelr-   r  rw  language_modelmake_empty_intermediate_tensors)rK   r^  r  r  rx  r  rt  r   s          rC   r  z'Lfm2VLForConditionalGeneration.__init__J  s   *7A'4F,"/&!2!2!E!O##K99 	 	'+AAA$0(!-'??% % %!! !T-:RTT   *C"&"8#F,CDD* * *D&	 	 	 	 	 	 	 	 	 	 	 	 	 	 	$ &&{33 	 	"<' ,#FJ77$0>	# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ? 	,,,s%   A3DD	D"6E$$E(+E(rO   c                     |                     dd           }|                     dd           }|                     dd           }|d S t          d|||          S )Nr1   r7   r9   )r2   r1   r7   r9   )popLFM2VLImageInputs)rK   rO   r1   r7   r9   s        rC   _parse_and_validate_image_inputz>Lfm2VLForConditionalGeneration._parse_and_validate_image_inputt  sl     zz.$77$4d;;jj554 %)#	
 
 
 	
rB   r1   r7   c           
         |j         j        dk    s
J d            |                    | j        j        j        j        j        j                  }|	                                }d |D             }t          t          |                    }|d d df         |d d df         z                      t          j                  }|                                r'|                                                    d          n t          j        dgt          j                  }|dk    rg S |                    ||j        d         f          }d}	t)          |          D ];\  }
}|dk    r||	|	|z                                ||
d |f                    |	|z  }	<|                    d          }t          j        |t          j        |j                   }t          j        |j        d         dz   t          j        |j                   }t          j        |d	          |dd <   t3          d | j                  5  |                     ||||
          }d d d            n# 1 swxY w Y   t7          |d|          }|d         }| j        j        }g }t=          ||          D ]m\  \  }}}|dk    r|                    d           $||z  dk    s	||z  dk    rtA          d| d| d| d          |                    ||z  ||z  z             n|                     ||          }g }d}	|D ]'}|                    ||	|	|z                       |	|z  }	(|S )Nr$  r%  r)  c                     g | ]
\  }}||z  S rA   rA   r'  s      rC   r}   zKLfm2VLForConditionalGeneration.image_pixels_to_features.<locals>.<listcomp>  r(  rB   r   r$   r   )r*  r1  dim)pixel_values_packedr7   
cu_seqlens
max_seqlenlast_hidden_stater-  r.  r/  r0  )r"  r7   )!r1  r2   r;  rs  vision_model
embeddingspatch_embeddingweightr*  r3  r^   sumr?   int32numelrg   r7  r   r=  r2  	enumeratecopy_	unsqueezezeroscumsumr   r^  getattrru  r  r8  r   r9  )rK   r1   r7   r>  r?  total_tokenslengths_cpur  packed_pixel_valuesrA  r   rH  lengthsr  vision_outputsimage_outputs_packedr"  r  projected_lengths_listrZ   rY   rT  image_featuresout_lens                           rC   image_pixels_to_featuresz7Lfm2VLForConditionalGeneration.image_pixels_to_features  s]   
 $)U222' 322
 $#0;KRX ' 
 
 0>/D/D/F/F>>*=>>>3|,,--%aaad+nQQQT.BBFF+ G 
 

   ""6KOO%%a(((qc555 	 1I*44<-b12
 
 "<00 	 	IAv{{& 89??QZ(   fFF1;;A>>,L4G
 
 
 [M!q +&
 
 


 g1555
122 t'788 	 	!..$7-%%	 /  N	 	 	 	 	 	 	 	 	 	 	 	 	 	 	  '/ 
  
 "6a!8+2,.'*+>'M'M 		R 		R#OVUV{{&--a000!##uv~':': E"E E&+E E;AE E E   #))6V+;*PQQQQ55#9) 6 
 

 .0- 	 	G!!"26FW<L3L"MNNNgFFs   4II!Iimage_inputc                    |d         }|d         }|d         }|                      ||          }|                                }g }d}|D ]=}	||||	z            }
|                    t          j        |
d                     ||	z  }>|S )Nr1   r7   r9   )r7   r   r  )r  r3  r   r?   r:  )rK   r  r1   r7   r9   r  num_patches_listbatched_features	patch_idxcountimage_patchess              rC   _process_image_inputz3Lfm2VLForConditionalGeneration._process_image_input  s     #>2$%56!-066) 7 
 
 '--///1	% 	 	E*9y57H+HIM##EIm$C$C$CDDDIIrB   c                 N     | j         di |}|g S |                     |          S rR   )r  r  )rK   rO   r  s      rC   embed_multimodalz/Lfm2VLForConditionalGeneration.embed_multimodal  s9    :d:DDVDDI((555rB   r   	positionsintermediate_tensorsinputs_embedsc                 @    |d }|                      ||||          }|S )N)r   r  r  r  )r}  )rK   r   r  r  r  rO   rS  s          rC   rU  z&Lfm2VLForConditionalGeneration.forward  s=      + M++!5'	 , 
 
 rB   rS  c                 6    | j                             |          S rG   )r}  compute_logits)rK   rS  s     rC   r  z-Lfm2VLForConditionalGeneration.compute_logits  s     "11-@@@rB   weightsc                 X    t          |           }|                    || j                  S )N)mapper)r+   load_weightshf_to_vllm_mapper)rK   r  loaders      rC   r  z+Lfm2VLForConditionalGeneration.load_weights  s+    "4((""743I"JJJrB   c                 0    t          j        ddd          S )z<
        Get the module prefix in multimodal models
        r}  ru  rs  )r}  	connectortower_model)r   from_string_fieldrJ   s    rC   get_mm_mappingz-Lfm2VLForConditionalGeneration.get_mm_mapping  s'     /+-&
 
 
 	
rB   )NN)$r:   r;   r<   merge_by_field_configr,   r  classmethodr   r^   r]  r   r?   r*  rd  ro  r   r  r   r  r  FloatTensorr@   r  r   r  r&   r  r!   rU  r  r   r   r  r   r  r  r	  s   @rC   rX  rX    s        !%1%<#2,D	
 
   =3 =3 =3: = = = [= 
!
 
u{C	 
 
 
 [
 
!
 
uS#X	
 
 
 [
, BI (
 (
 (
z (
3 (
 (
 (
 (
 (
 (
T

	T	!
 
 
 
 Z'Z Z 
	Z Z Z Zx &  
U\*	*       66 64H 6 6 6 6 <@-1 < < 2D8	
 |d*  
+	+   &A|A 
	A A A AKHU33D-E$F K3s8 K K K K
 
 
 
 
 
 
 
 
rB   rX  )Nr   r_   collections.abcr   r   r   typingr   r   r?   torch.nnr  transformersr   transformers.activationsr	   transformers.models.lfm2_vlr
   1transformers.models.lfm2_vl.configuration_lfm2_vlr   9transformers.models.lfm2_vl.image_processing_lfm2_vl_fastr   r   r   vllm.configr   vllm.config.multimodalr   vllm.forward_contextr   ,vllm.model_executor.layers.mamba.mamba_utilsr   r   )vllm.model_executor.models.module_mappingr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   r   vllm.multimodal.processingr   r   r   r   r    vllm.sequencer!   vllm.utils.tensor_schemar"   r#   
interfacesr%   r&   r'   r(   r)   lfm2_siglip2r*   utilsr+   r,   r-   r.   r0   r  rE   r   r   Moduler  register_processorrX  rA   rB   rC   <module>r     sF        7 7 7 7 7 7 7 7 7 7 % % % % % % % %        % % % % % % + + + + + + 7 7 7 7 7 7 J J J J J J          # " " " " " 3 3 3 3 3 3 4 4 4 4 4 4        E D D D D D / / / / / /         
 V U U U U U U U U U              . - - - - - > > > > > > > >              ' & & & & &           ; ; ; ; ;\ ; ; ; + `6 `6 `6 `6 `6- `6 `6 `6F
 
 
 
 
56JK 
 
 
:Z
 Z
 Z
 Z
 Z
 78L M Z
 Z
 Z
z_  _  _  _  _ 	 _  _  _ D ('	)  
S
 S
 S
 S
 S
I!<XS
 S
 
S
 S
 S
rB   