
    .`iw                    d   d dl Z d dlmZmZmZ d dlmZmZ d dlm	Z	m
Z
 d dlmZ d dlmZmZ d dlZd dlZd dlmZ d dlmc mZ d dlmZ d dlmZ d d	lmZmZmZm Z  d d
l!m"Z" d dl#m$Z$ d dl%m&Z&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z-m.Z. d dl/m0Z0m1Z1 d dl2m3Z3m4Z4m5Z5m6Z6m7Z7 d dl8m9Z9 d dl:m;Z;m<Z<m=Z= d dl>m?Z? d dl@mAZA d dlBmCZCmDZDmEZEmFZF d dlGmHZH d dlImJZJ d dlKmLZL d dlMmNZNmOZO d dlPmQZQ d dlRmSZS d dlTmUZU d dlVmWZWmXZXmYZYmZZZ d dl[m\Z\m]Z]m^Z^m_Z_ d d l`maZambZbmcZcmdZdmeZe d d!lfmgZg d d"lhmiZi d d#ljmkZk d d$llmmZmmnZn d%d&lompZpmqZqmrZrmsZsmtZt d%d'lumvZvmwZwmxZxmyZymzZzm{Z{m|Z|m}Z}  e9e~          Zd(Zd)Zd*Z G d+ d,em          Z G d- d.em          Ze G d/ d0                      Ze G d1 d2                      Ze G d3 d4                      Z G d5 d6ej                  Z G d7 d8ej                  Z G d9 d:ej                  Z G d; d<ej                  Z G d= d>ej                  Z G d? d@ej                  Z G dA dBej                  Z G dC dDej        et          Z G dE dFej                  Z G dG dHej                  Z G dI dJej                  Z G dK dLe          Ze+ G dM dNej        et                      ZdOedPedQedRedSedTeeef         fdUZdVedTeeeef                  fdWZdXedYedQedZefd[Zd\e"dTe]fd]Zd^e"dz  dTe"dz  fd_Zd`ej        daedbedceddedeedTeej        ej        f         fdfZdgej        daedhediedTeej        ej        f         f
djZ G dk dl          Zefdmeez  dneez  doeez  dTee         fdpZdmedqedredsedtee         dTedz  fduZdv Z G dw dxeb          Z G dy dzege                   Z G d{ d|eae                   Z eUj        eee}           G d~ dej        ereseqet                      Zdeeeej        f                  dTeeeej        f                  fdZdS )    N)IterableMappingSequence)	dataclassfields)cached_propertypartial)islice)	AnnotatedAny)ImageOps)Image)BatchFeaturePretrainedConfigProcessorMixin
TensorType)
ImageInput)	TextInput)
VideoInputVideoMetadata)	Attention)support_torch_compile)CacheConfig
VllmConfig)BaseDummyOptionsVideoDummyOptions)get_pp_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_sizesplit_tensor_along_last_dim tensor_model_parallel_all_gather)init_logger)
MulAndSilu
SiluAndMul
get_act_fn)MMEncoderAttention)RMSNorm)ColumnParallelLinearMergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)MultiModelKeys)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems	VideoItem)ImageProcessorItems	ImageSizeMultiModalDataItemsMultiModalDataParser)BaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)BaseDummyInputsBuilder)IntermediateTensors
round_down)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPPSupportsQuant)AutoWeightsLoaderWeightsMapper_merge_multimodal_embeddingsextract_layer_indexis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixz	<|image|>z	<|video|>   c                   *   e Zd ZU dZeej         eddd          f         ed<   eej         edd          f         ed<   	 eej         ed	          f         ed
<   eej	         ed          f         ed<   eej         ed	          f         ed<   dS )Molmo2ImageInputsa`  
    Dimensions:
        - nc: The total number of crops (dynamic)
        - np: The total number of patches per crop
        - cps: Number of channels * patch_size * patch_size
        - npp: Number of pooled patches (dynamic)
        - pp: pooling_size * pooling_size
        - ni: Number of images
        - nt: Number of image tokens (dynamic)
    ncnpcpspixel_valuesnpppptoken_poolingninum_pooled_patchesntimage_tokensnum_image_tokensN
__name__
__module____qualname____doc__r   torchTensorrF   __annotations__
BoolTensor     u/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/molmo2.pyrW   rW   m   s         	 	 EL++dD%*H*HHIIIIU\;;ud+C+CCDDDD
 "%,D0A0A"ABBBBE,kk$.?.??@@@@kk$.?.? ?@@@@@@rn   rW   c                   *   e Zd ZU dZeej         eddd          f         ed<   eej         edd          f         ed<   	 eej         ed	          f         ed
<   eej	         ed          f         ed<   eej         ed	          f         ed<   dS )Molmo2VideoInputsab  
    Dimensions:
        - nc: The total number of frames (dynamic)
        - np: The total number of patches per frame
        - cps: Number of channels * patch_size * patch_size
        - npp: Number of pooled patches (dynamic)
        - pp: pooling_size * pooling_size
        - nv: Number of videos
        - nt: Number of video tokens (dynamic)
    rX   rY   rZ   pixel_values_videosr\   r]   r^   nvr`   ra   video_tokensnum_video_tokensNrd   rm   rn   ro   rq   rq      s         	 	 #5<T41O1O#OPPPPU\;;ud+C+CCDDDD
 "%,D0A0A"ABBBBE,kk$.?.??@@@@kk$.?.? ?@@@@@@rn   rq   c                       e Zd ZU dZdZeed<   dZeed<   dZeed<   dZ	eed	<   dZ
eed
<   dZeed<   dZeed<   dZeed<   dZeeef         ed<   dZeed<   dZeed<   d Zed             ZdS )	VitConfigzConfig for a vision transformer  hidden_sizei  intermediate_size   num_hidden_layers   num_attention_headsnum_key_value_headsH   head_dimgelu_pytorch_tanh
hidden_actư>layer_norm_eps)z  r   image_default_input_size   image_patch_sizeiA  image_num_posc                 8    t          | j                  | _        d S N)tupler   selfs    ro   __post_init__zVitConfig.__post_init__   s    (-d.K(L(L%%%rn   c                 >    | j         \  }}|| j        z  || j        z  fS r   )r   r   )r   hws      ro   image_num_patchzVitConfig.image_num_patch   s)    ,1D))10E+EEErn   N)re   rf   rg   rh   ry   intrk   rz   r|   r~   r   r   r   strr   floatr   r   r   r   r   propertyr   rm   rn   ro   rw   rw      s        ))K!s!!!s!!!!!!!!Hc)J))) NE   0:eCHo:::cM3M M M F F XF F Frn   rw   c                       e Zd ZU dZdZeeef         ed<   dZe	ed<   dZ
eed<   dZeed	<   dZeed
<   dZeed<   dZeed<   dZeed<   dZeed<   dS )AdapterConfigzConfig for a vit-llm adapter)i
vit_layersFpooling_attention_maskrx   ry   r}   r~   r   r   r   silur    J  rz      text_hidden_sizeN)re   rf   rg   rh   r   r   r   rk   r   boolry   r~   r   r   r   r   rz   r   rm   rn   ro   r   r      s         &&"*Jc3h***#(D(((K!!!!!!!!HcJ"s""" c     rn   r   c                   8   e Zd ZU dZdZeed<   	 dZeed<   	 dZeed<   	 dZ	eed	<   	 d
Z
eed<   	 dZeed<   	 dZeed<   	 dZeed<   	 dZeed<   	 dZeed<   	 dZeed<   	 dZeed<   	 dZeed<   	 dZeed<   	 dZeed<   	 dZeed<   	 d Zeed!f         d z  ed"<   d S )#
TextConfigz*Configuration for a text model transformerr   ry      r~      r      r   i R 
vocab_sizeadditional_vocab_sizeTqkv_bias0   r|   r   rz   r   r   i   max_position_embeddingsg    .A
rope_thetaFuse_qk_normolmoqk_norm_typer   r   
norm_afterN.rope_scaling_layers)re   rf   rg   rh   ry   r   rk   r~   r   r   r   r   r   r   r|   rz   r   r   r   r   r   r   r   r   r   r   r   rm   rn   ro   r   r      s        44K  "!!!  !    Hc J'!$3$$$FHd  s #s""" J $(S''' "J!!! K
 L#
 !NE    JI26sCx4/666 rn   r   c                   l     e Zd ZdZ	 	 ddededededz  ded	df fd
Zdej	        d	ej	        fdZ
 xZS )ViTMLPzMLP used in Vision Transformer.N dim
hidden_dimr   quant_configprefixreturnc                     t                                                       t          ||d|| d          | _        t	          |          | _        t          ||d|| d          | _        d S )NTz.w1biasr   r   z.w2)super__init__r(   w1r%   actr+   w2)r   r   r   r   r   r   	__class__s         ro   r   zViTMLP.__init__$  s     	&%>>>
 
 
 j))#%>>>
 
 
rn   xc                     |                      |          \  }}|                     |          }|                     |          \  }}|S r   )r   r   r   r   r   _s      ro   forwardzViTMLP.forward>  s<    wwqzz1HHQKKwwqzz1rn   Nr   re   rf   rg   rh   r   r   r-   r   ri   rj   r   __classcell__r   s   @ro   r   r   !  s        )) 37
 

 
 	

 )4/
 
 

 
 
 
 
 
4 %,        rn   r   c                   v     e Zd ZdZ	 	 	 ddedededed	ed
edz  deddf fdZde	j
        de	j
        fdZ xZS )ViTMultiHeadDotProductAttentionz0Multi-head attention used in Vision Transformer.TNr   ry   	num_headsr   r   use_biasr   r   r   c           
      X   t                                                       || _        || _        t	                      }| j        | j        z  dk    sJ | j        |z  dk    sJ | j        |z  | _        || _        | j        | j        | j        z  k    sJ || _        | j        |k    r| j        |z  dk    sJ n|| j        z  dk    sJ t          d| j        |z            | _	        | j        | j        z  | _
        | j	        | j        z  | _        t          | j        | j        | j        | j        ||| d          | _        t          | j        | j        z  | j        ||| d          | _        | j        dz  | _        t#          | j        | j        | j        | j	        | d          | _        d S )	Nr   rG   z.merged_qkvr   z.wo      .attn)num_kv_headsr   )r   r   ry   total_num_headsr   r   r   total_num_kv_headsmaxr   q_sizekv_sizer*   
merged_qkvr+   woscaler&   attn)
r   ry   r   r   r   r   r   r   tp_sizer   s
            ro   r   z(ViTMultiHeadDotProductAttention.__init__H  s    	&(688$"66!;;;;#g-2222-8 } 0D4H HHHHH"5"g--*W499999T4499994#:g#EFFnt}4(4=8+M #%)))
 
 
 $ 4=0%>>>
 
 
 ]D(
&NMJ*###
 
 
			rn   inputsc                     |                      |          \  }}|                    | j        | j        | j        gd          \  }}}|                     |||          }|                     |          \  }}|S Nr   )r   splitr   r   r   r   )r   r   qkvr   xqxkxvoutputs           ro   r   z'ViTMultiHeadDotProductAttention.forward  sl    ((QYYT\4<HbYQQ
B2r2&&GGFOO	rn   )TNr   )re   rf   rg   rh   r   r   r-   r   r   ri   rj   r   r   r   s   @ro   r   r   E  s        :: 26:
 :
:
 :
 !	:

 :
 :
 )4/:
 :
 
:
 :
 :
 :
 :
 :
xel u|        rn   r   c            	       d     e Zd ZdZ	 	 ddededz  deddf fdZd	ej	        dej	        fd
Z
 xZS )Molmo2VisionBlockz4Residual attention block used in Vision Transformer.Nr   configr   r   r   c           	         t                                                       t          |j        |j        |j        |j        || d          | _        t          |j        |j	        |j
        || d          | _        t          j        |j        |j                  | _        t          j        |j        |j                  | _        d S )Nz
.attention)ry   r   r   r   r   r   z.feed_forward)r   r   r   r   r   eps)r   r   r   ry   r~   r   r   	attentionr   rz   r   feed_forwardnn	LayerNormr   attention_normffn_normr   r   r   r   r   s       ro   r   zMolmo2VisionBlock.__init__  s     	8*0 & :_%(((
 
 
 #"/(%+++
 
 
 !l%
 
 
 %
 
 
rn   r   c                     ||                      |                     |                    z   }||                     |                     |                    z   }|S r   )r   r   r   r   )r   r   s     ro   r   zMolmo2VisionBlock.forward  sO    t22155666!!$--"2"2333rn   r   )re   rf   rg   rh   rw   r-   r   r   ri   rj   r   r   r   s   @ro   r   r     s        >>
 37	
 

 )4/
 	

 

 
 
 
 
 
> %,        rn   r   c            	       p     e Zd ZdZ	 	 ddededz  deddf fdZd	ej	        de
ej	                 fd
Z xZS )Molmo2VisionBlockCollectionzCCollection of residual attention blocks used in Vision Transformer.Nr   r   r   r   r   c                     t                                                       t          j        fdt	          j                  D                       | _        d S )Nc           	      >    g | ]}t           d |           S )z.resblocks.r   )r   ).0	layer_idxr   r   r   s     ro   
<listcomp>z8Molmo2VisionBlockCollection.__init__.<locals>.<listcomp>  sQ         " $<<<<    rn   )r   r   r   
ModuleListranger|   	resblocksr   s    ```ro   r   z$Molmo2VisionBlockCollection.__init__  sr     	      "'v'?!@!@  	
 	
rn   r   c                 ^    g }| j         D ]"} ||          }|                    |           #|S r   )r  append)r   r   hidden_statesrs       ro   r   z#Molmo2VisionBlockCollection.forward  sB     	$ 	$A!A  ####rn   r   )re   rf   rg   rh   rw   r-   r   r   ri   rj   listr   r   r   s   @ro   r   r     s        MM
 37	
 

 )4/
 	

 

 
 
 
 
 
$ $u|*<        rn   r   c            	            e Zd ZdZ	 	 ddededz  deddf fdZd	ej	        d
e
dej	        fdZ	 dd	ej	        d
e
dz  deej	                 fdZ xZS )Molmo2VisionTransformerz+Vision Transformer used in Vision Backbone.Nr   r   r   r   r   c                    t                                                       |j        dz  }d| _        |j        | _        t          j        t          j	        |j
        |j                  |z            | _        |j        }t          j        ||z  dz  |j        d          | _        t          ||| d          | _        d S )Nr   r      T)r   z.transformerr   )r   r   ry   num_prefix_tokensr   	patch_numr   	Parameterri   randnr   positional_embeddingr   Linearpatch_embeddingr   transformer)r   r   r   r   r   r   r   s         ro   r   z Molmo2VisionTransformer.__init__  s     	"D(&'/$&LK,f.@AAEI%
 %
! "2!y//!3 
  
  

 7***
 
 
rn   r   r  c           	         | j         }|                    t          t          j        |j        d                             t          t          j        |j        d                             |j        d         f          }|\  }}|j        d         |k    s|j        d         |k    rq|                    d                              dddd          }t          j	        |||fddd          }|                    dddd          
                    d          }|                    d	|j        d	                   }||d d d d d f                             |j                  z   }|S )
Nr   rG   r     bicubicFT)sizemodealign_corners	antialiasr   )r  reshaper   mathsqrtshape	unsqueezepermuteFinterpolatesqueezetodtype)r   r   r  pos_embpatch_num_0patch_num_1s         ro   add_pos_embz#Molmo2VisionTransformer.add_pos_emb  sW   +//DIgmA.//00DIgmA.//00a 
 
 &/"k={**gmA.>+.M.M''**221aA>>Gm!;/#  G ooaAq1199!<<G//"gmB&788aaa
#&&qw///rn   c                     || j         }|                     |          }|                     ||          }|                     |          }|S )z>
        : param x: (batch_size, num_patch, n_pixels)
        )r  r  r)  r  )r   r   r  r  s       ro   r   zMolmo2VisionTransformer.forward  sR     I  ##Q	**((++rn   r   r   )re   rf   rg   rh   rw   r-   r   r   ri   rj   r   r)  r  r   r   r   s   @ro   r	  r	    s        55
 37	
 

 )4/
 	

 

 
 
 
 
 
2U\ c el    @ !% < : 
el		       rn   r	  c                        e Zd ZdZ	 	 	 	 ddededed	ed
ededededz  deddf fdZ	 dde	j
        de	j
        de	j
        de	j
        dz  de	j
        f
dZ	 dde	j
        de	j
        de	j
        dz  de	j
        fdZ xZS )ImagePoolingAttentionz+Multi-head attention used for image poolingTFNr   	input_dimry   r   r   r   r   use_pytorch_sdpar   r   r   c
                    t                                                       || _        || _        || _        t                      }
| j        | j        z  dk    sJ | j        |
z  dk    sJ | j        |
z  | _        || _        | j        | j        | j        z  k    sJ || _        | j        |
k    r| j        |
z  dk    sJ n|
| j        z  dk    sJ t          d| j        |
z            | _
        | j
        | j        z  | _        t          | j        | j        | j        z  |||	 d          | _        t          | j        | j        | j        z  gdz  |||	 d          | _        t!          | j        | j        z  | j        |||	 d          | _        | j        dz  | _        || _        |r	d | _        d S t+          | j        | j        | j        | j
        	          | _        d S )
Nr   rG   z.q_projr   r  z
.merged_kvz.o_projr   )r   )r   r   r-  ry   r   r   r   r   r   r   r   r   r(   q_projr)   	merged_kvr+   o_projr   r.  r   r&   )r   r-  ry   r   r   r   r   r.  r   r   r   r   s              ro   r   zImagePoolingAttention.__init__#  s1    	"&(688$"66!;;;;#g-2222-8 } 0D4H HHHHH"5"g--*W499999T4499994#:g#EFF(4=8*N 4=0%%%%
 
 
 4N$t}459%(((
 
 
 ( 4=0%%%%
 
 
 ]D(
 0 	DIII*
!.	  DIIIrn   querykeyvalue	attn_maskc                    |                                 \  }}}|                     d          }|                    ||| j        | j                  }|                    ||| j        | j                  }|                    ||| j        | j                  }| j        | j        k    rHt          j        || j        | j        z  d          }t          j        || j        | j        z  d          }d |||fD             \  }}}t          j        ||||d          	                    dd          }	|	
                    ||d          S )NrG   r  r   c              3   B   K   | ]}|                     d d          V  dS )rG   r  N)	transpose)r   r   s     ro   	<genexpr>z5ImagePoolingAttention.forward_sdpa.<locals>.<genexpr>  s0      LL1Q[[A..LLLLLLrn   F)r6  	is_causalr   )r  viewr   r   r   ri   repeat_interleaver!  scaled_dot_product_attentionr9  r  )
r   r3  r4  r5  r6  bszq_lenr   kv_lenouts
             ro   forward_sdpaz"ImagePoolingAttention.forward_sdpai  sS    

UA!

3t~t}EEhhsFD$5t}EE

3(94=II>T...)$"33  C
 +$"33  E MLU8KLLLsE,
 
 
 )Aq// 	 {{3r***rn   inputs_q	inputs_kvc                 P   |                      |          \  }}|                     |          \  }}|                    | j        | j        gd          \  }}| j        r|                     ||||          }	n|                     |||          }	|                     |	          \  }	}|	S r   )r0  r1  r   r   r.  rC  r   r2  )
r   rD  rE  r6  r   r   kvr   r   r   s
             ro   r   zImagePoolingAttention.forward  s     H%%Ay))A4<6B??B  	+&&r2r9==FFYYr2r**FKK''	rn   )TFNr   r   )re   rf   rg   rh   r   r   r-   r   r   ri   rj   rC  r   r   r   s   @ro   r,  r,     s~       55 !&26D DD D 	D
 !D D D D )4/D D 
D D D D D DV *.$+ $+|$+ \$+ |	$+
 <$&$+ 
$+ $+ $+ $+T *.	 , < <$&	
 
       rn   r,  c                   p     e Zd ZdZ	 	 ddedededededz  d	ed
df fdZdej	        d
ej	        fdZ
 xZS )ImageProjectorMLPz MLP used for the image projectorNr   r-  r   
output_dimr   r   r   r   c                     t                                                       t          ||gdz  d|| d          | _        |dk    sJ t	                      | _        t          ||d|| d          | _        d S )Nr  Fz.merged_linearr   r   z
.down_proj)r   r   r)   merged_linearr$   act_fnr+   	down_proj)r   r-  r   rJ  r   r   r   r   s          ro   r   zImageProjectorMLP.__init__  s     	7L1%,,,
 
 
 V#### ll +%(((
 
 
rn   r   c                     |                      |          \  }}|                     |          }|                     |          \  }}|S r   )rL  rM  rN  r   s      ro   r   zImageProjectorMLP.forward  sB    !!!$$1KKNN~~a  1rn   r   r   r   s   @ro   rI  rI    s        ** 37
 

 
 	

 
 )4/
 
 

 
 
 
 
 
> %,        rn   rI  c                   <    e Zd Zg dddgddgdZ	 	 dd	ed
ededz  deddf
 fdZe	de
j        fd            Ze	de
j        fd            Zde
j        de
j        fdZde
j        de
j        de
j        fdZdeeee
j        f                  dee         fdZ xZS )Molmo2VisionBackbonewqwkwvk_projv_proj	gate_projup_proj)r   r1  rL  Nr   
vit_configadapter_configr   r   r   c                    t                                                       || _        || _        g | _        |j        D ]E}|dk    r| j                            |           #| j                            ||j        z              Ft          | j                  dz   }||j        k     r||_        t          ||| d          | _	        | j	        j
        | _
        |j        t          |j                  z  }t          ||j        |j        |j        |j        |j        || d          | _        t'          |j        |j        |j        |j        || d          | _        d S )	Nr   rG   z
.image_vitr   z.image_pooling_2d)r-  ry   r   r   r   r.  r   r   z.image_projector)r-  r   rJ  r   r   r   )r   r   rZ  r[  r   r  r|   r   r	  	image_vitr  ry   lenr,  r~   r   r   r   image_pooling_2drI  rz   r   r   image_projector)	r   rZ  r[  r   r   layerlast_layer_neededpool_dimr   s	           ro   r   zMolmo2VisionBackbone.__init__  s    	$,#. 	M 	MEzz&&u----&&uz/K'KLLLL0014z;;;+<J(0(((
 
 
 '+n&F)C0I,J,JJ 5&2$8 . B#,+B%///	!
 	!
 	!
  1$0%7%6%0%... 
  
  
rn   c                 .    | j         j        j        j        S r   )r]  r  weightr%  r   s    ro   r%  zMolmo2VisionBackbone.dtype  s    ~-4::rn   c                 .    | j         j        j        j        S r   )r]  r  re  devicer   s    ro   rg  zMolmo2VisionBackbone.device  s    ~-4;;rn   imagesc                 X   |j         \  }}}}|                    ||z  ||          }|                     |          }g }| j        D ]}|                    ||                    t          j        |d          }| j        dk    r|ddddf         }|                    |||d          }|S )zN
        : param images: (batch_size, num_crops, num_patch, n_pixels)
        r   r   r   NrG   )r  r<  r]  r   r  ri   catr  )	r   rh  BTNDimage_featuresfeaturesra  s	            ro   encode_imagez!Molmo2VisionBackbone.encode_image  s     \
1aQUAq))//_ 	3 	3EOON5122228444!A%%+AAAqrrE2N',,Q1b99rn   r^   c                    |j         d d         \  }}|                    | j        | j                  }|                     |          }|j         d         }|dk    }t          j        |d          }t          j        |j         d         t
          j        |j                  }	t          j	        |	
                    |dd          d|j         d         |j         d         g          }	|                    |d|          |	t          j        |d          f         }
|
|                    | j                  d d d d d d d f         z  }
|
                    d|j         d         |g          }
| j        j        r|                    ddd|j         d         g          }|
                    d|
j         d                                                                       d          }t          j        |dk    d|          }|
                    dd	          |d d d d f                             |
j                  z  }nd }|
                    dd	          }|                     ||
|
          }|                    |d|j         d         g          }|                     |          }|
                    d|j         d                   |                                         S )Nr  rg  r%  r   r   r%  rg  rG   T)keepdim)r6  )r  r$  rg  r%  rq  ri   anyarangelongtiler<  r  clipr[  r   r   sumwheremeanr_  r`  flatten)r   rh  r^   
batch_size	num_imagero  r   validvalid_token	batch_idxto_poolr6  denomr3  pooled_featuress                  ro   r   zMolmo2VisionBackbone.forward  s    !'RaR 0
I$+TZ@@**622"2&"ir** L"* '
 
 
	
 JNN:q!,,#A&(;A(>?
 
	 !((R==uz-333
 EHHTZ00AAAqqq$??//2}':2'>"DEE5 		3r1aR&ABBIJJr7=#455;;==AA"EEEK
Au55EKKDK11E!!!T4-4H4K4K5 5 EE ILLTL22E//w)/TT)11_2267
 

 ..??##B(=b(ABB!!
 	
rn   weightsc                 (   g d}t          |                                           }t                      }|D ]\  }}|D ]i\  }}}	||vr|                    ||          }|                    d          r||vr;t          ||           rL||         }
|
j        } ||
||	            nU|                    d          r||vrt          ||           r||         }
t          |
dt                    } ||
|           |	                    |           |S )N))r   rS  q)r   rT  k)r   rU  v)r1  rV  r   )r1  rW  rG   )rL  rX  r   )rL  rY  rG   .biasweight_loader)
dictnamed_parameterssetreplaceendswithrQ   r  getattrr1   add)r   r  stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr  s               ro   load_weightsz!Molmo2VisionBackbone.load_weightsT  se   	"
 	"
 	"
 4002233"%%%#* 	$ 	$D-5K 4 41
Kd**||K<<==)) d+.E.E*466 #D) % 3e]H=====)) d+.E.E*466 #D) '@U V Ve]333d####rn   r   )re   rf   rg   packed_modules_mappingrw   r   r-   r   r   r   ri   r%  rg  rj   rq  r   r   r   r  r  r   r   s   @ro   rQ  rQ    s       ((()%y1  37/
 /
/
 &/
 )4/	/

 /
 
/
 /
 /
 /
 /
 /
b ;u{ ; ; ; X; < < < < X<5< EL    $4
4
 |4
 
	4
 4
 4
 4
l%HU33D-E$F %3s8 % % % % % % % %rn   rQ  c                        e Zd ZdZ	 	 	 ddedeeef         dedz  de	dz  ded	df fd
Z
dej        dej        d	eej        ej        f         fdZdej        dej        ded	ej        fdZ xZS )Molmo2AttentionzMolmo2's LLM Attention.Nr   r   rope_parameterscache_configr   r   r   c           
         t                                                       |j        | _        t                      | _        |j        | _        | j        | j        z  dk    sJ | j        | j        z  dk    sJ | j        | j        z  | _        |j        | _	        | j	        | j        k    r| j	        | j        z  dk    sJ n| j        | j	        z  dk    sJ t          d| j	        | j        z            | _        |j        | _        | j        | j        z  | _        | j        | j        z  | _        |j        | _        |j        | _        t#          | j        | j        | j        | j	        |j        |          | _        d | _        d | _        d | _        d | _        |j        r|j        dk    r| j        n| j	        | j        z  }t3                      | _        t5          ||j                  | _        |j        dk    r| j        n| j        | j        z  }t5          ||j                  | _        |j        | _        t9          |          }|j        ||j        vr|d         }	d|	d}t=          | j        | j        |	          | _        | j        d
z  | _         tC          | j        | j        | j         | j        ||| d          | _"        tG          | j        | j        z  | j        d|          | _$        d S )Nr   rG   r   r   qwen3r   r   default)	rope_typer   )max_positionr  r   r   )r   r  r   r   F)%r   r   ry   r   r   r~   r   r   r   r   r   r   r   r   r   r   r   r*   r   qkv_projtp_rankk_normq_normr   r   r   r'   r   rP   r   r.   
rotary_embscalingr   r   r+   r2  )r   r   r  r  r   r   k_norm_sizeq_norm_sizer   r   r   s             ro   r   zMolmo2Attention.__init__  s    	!-;==%9$"66!;;;;#dl2a7777-="("<"dl22*T\9Q>>>>><$"99Q>>>>4#:dl#JKKnt}4(4=8'-'E$ + *M #%
 
 
 $((,(,(, 	4 &'11 ,t}< 
 :;;DL!+63HIIIDK &'11 )DM9 
 "+63HIIIDK & 3D'//	&2!;;;(6J,5ZPPO"M5+
 
 

 }d*NML*%%###
 
 
	 ( 4=0%	
 
 
rn   r  r  c                    | j         dk    rBt          |                                          }t          |                                          }|                     |          }|                     |          }| j         dk    rGt          t          | j                   } ||          | j                 } ||          | j                 }||fS )NrG   )num_partitions)r   r!   
contiguousr  r  r	   r    r  )r   r  r  splitters       ro   _apply_qk_normzMolmo2Attention._apply_qk_norm  s    
 <!0@@A0@@AKKNNKKNN<!:4<XXXHDL)ADL)A!trn   	positionsr  kwargsc                 8   |                      |          \  }}|                    | j        | j        | j        gd          \  }}}| j        ,| j        %| j        dk    r|                     ||          \  }}n| j        | j         |j        g |j	        d d         |j	        d         | j
        z  | j
        R  }	|                     |	          }	|	                    |j	                  } |j        g |j	        d d         |j	        d         | j
        z  | j
        R  }
|                     |
          }
|
                    |j	                  }|                     |||          \  }}|                     |||          }|                     |          \  }}|S )Nr   r   r   )r  r   r   r   r  r  r   r  r<  r  r   r  r   r2  )r   r  r  r  r   r   r  r  r  	q_by_head	k_by_headattn_outputr   s                ro   r   zMolmo2Attention.forward  s    }--Q))T[$,E2)NN1aK#'!V++&&q!,,DAqq[$)@ "t},   I
 I..Iqw''A "t},   I
 I..Iqw''Ay!Q//1ii1a((KK,,	rn   NNr   )re   rf   rg   rh   r   r  r   r   r   r-   r   ri   rj   r   r  objectr   r   r   s   @ro   r  r  |  s/       !! ,026Y
 Y
Y
 c3hY
 "D(	Y

 )4/Y
 Y
 
Y
 Y
 Y
 Y
 Y
 Y
v< < 
u|U\)	*	    !<! |! 	!
 
! ! ! ! ! ! ! !rn   r  c                   f     e Zd ZdZ	 ddededededz  ddf
 fdZd	ej	        dej	        fd
Z
 xZS )LanguageModelMLPzMolmo2's LLM mlp.Nr-  rz   r   r   r   c                     t                                                       t          ||gdz  d|          | _        |dk    sJ t	                      | _        t          ||d|          | _        d S )Nr  Fr  r   )r   r   r)   up_gate_projr#   rM  r+   rN  )r   r-  rz   r   r   r   s        ro   r   zLanguageModelMLP.__init__  s     	6!#%	
 
 
 V#### ll*%	
 
 
rn   r   c                     |                      |          \  }}|                     |          }|                     |          \  }}|S r   )r  rM  rN  )r   r   up_gater   s       ro   r   zLanguageModelMLP.forward+  sF     &&q))
KK  ~~a  1rn   r   r   r   s   @ro   r  r    s         37
 

 
 	

 )4/
 

 
 
 
 
 
4< 
       rn   r  c                        e Zd Z	 	 	 ddedeeef         dedz  dedz  deddf fd	Z	d
e
j        de
j        de
j        dz  dedee
j        ee
j        e
j        f         dz  f         f
dZ xZS )Molmo2DecoderLayerNr   r   r  r  r   r   r   c                 L   t                                                       t          ||||| d          | _        t	          |j        |j        |j        |          | _        t          |j        |j
                  | _        t          |j        |j
                  | _        d S )Nz
.self_attnr   r   )r   r   r  	self_attnr  ry   rz   r   mlpr'   r   input_layernormpost_attention_layernorm)r   r   r  r  r   r   r   s         ro   r   zMolmo2DecoderLayer.__init__6  s     	((((
 
 
 $$	
 
  'v'9v?TUUU(/%)
 )
 )
%%%rn   r  r  residualr  c                     ||}|                      |          }n|                      ||          \  }} | j        d||d|}|                     ||          \  }}|                     |          }||fS N)r  r  rm   )r  r  r  r  r   r  r  r  r  s        ro   r   zMolmo2DecoderLayer.forwardW  s     $H 00??MM&*&:&:=(&S&S#M8& 
'
 
 
 
 #'"?"?x"X"Xx//h&&rn   r  )re   rf   rg   r   r  r   r   r   r-   r   ri   rj   r  r   r   r   r   s   @ro   r  r  5  s       
 ,026
 

 c3h
 "D(	

 )4/
 
 

 
 
 
 
 
B'<' |' ,%	'
 ' 
u|U5<#=>EE	F' ' ' ' ' ' ' 'rn   r  c                       e Zd Zdej        dej        dej        dz  dedeej        eej        ej        f         dz  f         f
dZdS )Molmo2DecoderNormAfterLayerr  r  r  Nr  r   c                     |} | j         d||d|}|                     |          }||z   }|}|                     |          }|                     |          }||z   }d }||fS r  )r  r  r  r  r  s        ro   r   z#Molmo2DecoderNormAfterLayer.forwardp  s     !& 
'
 
 
 
 ,,];;%0 //55mDD%0h&&rn   )re   rf   rg   ri   rj   r  r   r   rm   rn   ro   r  r  o  s        '<' |' ,%	'
 ' 
u|U5<#=>EE	F' ' ' ' ' 'rn   r  c                        e Zd Zdddedef fdZdej        dej        fdZ	 	 ddej        d
ej        de	d	z  dej        d	z  de
dej        fdZdeeeej        f                  dee         fdZ xZS )Molmo2TextModelr   r   vllm_configr   c                  	
 t                                                       |j        j        }|j        |j        	|| _        t          |d          r|j        n|j	        i }t          t                    D ]}t          |j                  ||j        <    t          d
i |

j        | _        | xj        
j        pdz  c_        t#          | j        
j        	          | _        
j        rt*          nt,          t/          
j        	
fd| d          \  | _        | _        | _        t9          
j        
j                  | _        t?          dd	g
j                  | _         d S )Ntext_configr   r   c                 .     j         |           S )N)r  r   r   )r  )r   r  decoder_layerhf_text_configr   r  s    ro   <lambda>z*Molmo2TextModel.__init__.<locals>.<lambda>  s*    ==.))   rn   z.layersr   r   r  r  rm   )!r   r   model_config	hf_configr  r   r   hasattrr  
llm_configr   r   r  r  r   embedding_sizer   r0   ry   embed_tokensr   r  r  rS   r|   start_layer	end_layerlayersr'   r   normrR   make_empty_intermediate_tensors)r   r  r   r   r  fieldr  r  r  r   r  r   s         @@@@@ro   r   zMolmo2TextModel.__init__  s   )3"/"/6=)) 	/#/NN#.NJ'' 	E 	EE!(!D!DF5: **6**)4{@EAE2#%
 
 
 %$''# 	
 9D)        %%%
9
 
9
 
9
5$.$+ K39STTT	/Vj)#0
 0
,,,rn   	input_idsr   c                 ,    |                      |          S r   )r  )r   r  s     ro   embed_input_idszMolmo2TextModel.embed_input_ids  s      +++rn   Nr  intermediate_tensorsinputs_embedsr  c                    t                      j        r||}n|                     |          }d }n|J |d         }|d         }t          | j        | j        | j                  D ]} ||||fi |\  }}t                      j        st          ||d          S || 	                    ||          \  }}	n| 	                    |          }|S )Nr  r  )r  r  )
r   is_first_rankr  r
   r  r  r  is_last_rankrB   r  )
r   r  r  r  r  r  r  r  ra  r   s
             ro   r   zMolmo2TextModel.forward  s    >>' 		8( - $ 1 1) < <HH'3330AM+J7H DK)94>JJ 	 	E&+e' ' 	' '#M88 ~~* 	&"/XFF   #yyAAM11 IIm44Mrn   r  c                 H   t          |                                           }t                      }|D ]o\  }}|                    d          r||vrt	          ||           r0||         }t          |dt                    } |||           |                    |           p|S )Nr  r  )r  r  r  r  rQ   r  r1   r  )r   r  r  r  r  r  r  r  s           ro   r  zMolmo2TextModel.load_weights  s    4002233"%%%#* 		$ 		$D-}}W%% $k*A*A&tT22 %E#E?<QRRMM%///d####rn   NN)re   rf   rg   r   r   r   ri   rj   r  rB   r  r   r   r   r  r  r   r   s   @ro   r  r    s%       AC 1
 1
 1
z 1
3 1
 1
 1
 1
 1
 1
f, ,%, , , , , <@-1# #<# <# 2D8	#
 |d*# # 
# # # #JHU33D-E$F 3s8        rn   r  image_himage_w
patch_sizepool_hpool_wr   c                     | |z  }||z  }t          ||z   dz
  |          |z
  }t          ||z   dz
  |          |z
  }||z   |z  }	||z   |z  }
|	|
fS NrG   rC   )r  r  r  r  r  patch_hpatch_wh_padw_padnrowsncolss              ro   get_patches_grid_sizer    sz     #G#Gw'!+V44w>Ew'!+V44w>Eu_'Eu_'E%<rn   max_numc                 f      fdt          d dz             D             }t          |d           S )Nc                 V    g | ]%}t          d d z             D ]}||z  k    ||f&S )rG   )r  )r   ijr  s      ro   r   z)get_candidate_tilings.<locals>.<listcomp>  s[       q'A+&&  q5G 
A rn   rG   c                 4    | d         | d         z  | d         fS )Nr   rG   rm   )r   s    ro   r  z'get_candidate_tilings.<locals>.<lambda>  s    !A$1+qt)< rn   )r4  )r  sorted)r  tilingss   ` ro   get_candidate_tilingsr
  
  sR       q'A+&&  G '<<====rn   heightwidthmax_num_patchesc                    t          |          }t          j        |t          j                  }||z  }t          j        | |gt          j                  }|                    t          j                  |z  }|                    dd          }	|	dk                                     r|	                                }
n,t          j	        |	dk     d|	          
                                }
||
         S )Nr%  r   T)axiskeepdimsrG   g      ?g    _B)r
  rY   arrayint32float32astypeminallargmaxr}  argmin)r  r  r  r  r	  candidate_tilingscandidate_resolutionsoriginal_sizerequired_scale_drequired_scaleixs              ro   select_tilingr     s     $O44G999-
:Hfe_BJ???M,33BJ??-O%))rD)AAN!! K""$$Xns*D.AAHHJJR  rn   imagec                 .   t          | t                    rt          | j         S t          | t          j        t          j        f          r.| j        dk    sJ | j	        \  }}}|dv sJ t          ||          S t          dt          |                      )Nr  )rG   r  zUnknown image type: )
isinstancer   r9   r  rY   ndarrayri   rj   ndimr  
ValueErrortype)r!  r   r   cs       ro   get_image_sizer)  +  s    % ?%*%%	EBJ5	6	6 ?zQ+1aF{{{{A=U==>>>rn   rh  c                     | d S | )t          | t          t          f          rd | D             } n+| )t          | t                    rt	          j        |           } | S )Nc                 Z    g | ](}t          |t                    rt          |          n|)S rm   )r#  r   exif_tranpose)r   imgs     ro   r   z!exif_tranpose.<locals>.<listcomp>=  sA     
 
 
FI*S%"8"8AM#c
 
 
rn   )r#  r  r   r   r   exif_transpose)rh  s    ro   r,  r,  7  ss     ~tj$??
 
MS
 
 
 
	
65 9 9	(00Mrn   image_gridsimage_patch_idlow_res_image_start_idimage_start_idimage_col_idimage_end_idc                    | j         }| j        d         }| d d df         }| d d df         }	| d d df         }
| d d df         }||	z  |
|dz   z  z   dz   }t          |                                                                          }t          j        |t
          j        |          }d}t          |          D ]}| |         	                                \  }}}}t          ||                                                   }||z  }|}|||<   |dz  }|dk    r|||||z   <   ||z  }|||<   |dz  }|||<   |dz  }|dz   }|dk    rc|dk    r]t          j        |t
          j        |          }|dk    r||d |<   |||<   |
                    |          }||||||z  z   <   |||z  z  }|||<   |dz  }||z
  |k    sJ ||z  }||fS )Nr   rG   r  r  r   rt  )rg  r  r   r|  itemri   emptyry  r  tolistrepeat)r/  r0  r1  r2  r3  r4  rg  rk  	resized_h	resized_wr   r   lengths	total_lenflatoffsetr  resized_h_iresized_w_ih_iw_iL_inum_low_res_patchesidx	block_lenlineblocks                              ro   build_flat_image_bool_lengthrJ  E  s>    F!AAAAqD!IAAAqD!IAAAqDAAAAqDA)#a1q5k1A5GGKKMM&&(())I;y
6BBBDF1XX % %-8^-B-B-D-D*[#s'!*//##$$)K7*S	q""4BDs0001&&C S	q"S	q!G	q==S1WW;y
6JJJDQww+TcT
$DIKK$$E05DsS9_,,-3?"C S	qV|s""""#=rn   video_gridsframe_start_idframe_end_idc                    | j         }| j        d         }| d d df         }| d d df         }| d d df         }||z  }	|	dz   }
||
z  }t          |                                                                          }t          j        |t
          j        |          }d}t          |          D ]}t          ||                                                   }t          |	|                                                   }t          ||                                                   }t          j        |dz   t
          j        |          }||d<   |dk    r
||dd|z   <   ||d<   |	                    |          }|||||z   <   ||z  }||fS )Nr   rG   r  rt  r   )
rg  r  r   r|  r6  ri   r7  ry  r  r9  )rK  r0  rL  rM  rg  rk  tr:  r;  PrG  r<  r=  r>  r?  r  tiPiLirI  seqs                        ro   build_flat_video_bool_lengthrU    s    F!AAAAqDAAAAqD!IAAAqD!IIAAI)mGGKKMM&&(())I;y
6BBBDF1XX  11""##BF%*VDDD!a66 .E!a"f* b	ll2%(Vfrk!""=rn   c                   p    e Zd ZdZdedef fdZedee	e
f         fd            Zede
fd            Zede
fd            Zede
fd	            Zede
fd
            Zede
fd            Zedee
e
f         fd            Zede
fd            Zedee
e
f         fd            Zede	fd            Zede
fd            Zede
fd            Zede
fd            Zede
fd            Zede
fd            Zede
fd            Zede
fd            Zede
fd            Zede
fd            Zede
fd            Zede e
         fd            Z!de
de
dee
e
f         fdZ"de#dee
e
f         fdZ$de
de
dee
e
f         fd Z%	 	 	 	 d(d"e&e e&         z  d!z  d#e'd!z  d$e(d!z  d%e	e)z  d&e*de+fd'Z, xZ-S ))Molmo2ProcessorWrapperzK
    Wraps :class:`Molmo2Processor` so that it can be called directly.
    	processorr  c                 d    t                                                       || _        || _        d S r   )r   r   rX  r  )r   rX  r  r   s      ro   r   zMolmo2ProcessorWrapper.__init__  s+    ""rn   r   c                 $    | j         j        j        S r   )rX  	tokenizervocabr   s    ro   r\  zMolmo2ProcessorWrapper.vocab  s    ~'--rn   c                 Z    | j         j        }|j        }t          |t                    sJ |S r   )rX  image_processor	max_cropsr#  r   )r   r^  r_  s      ro   r_  z Molmo2ProcessorWrapper.max_crops  s0    .8#-	)S)))))rn   c                 f    | j         j        }|j        d         }t          |t                    sJ |S Nr   rX  r^  pooling_sizer#  r   )r   r^  image_pooling_hs      ro   rd  z&Molmo2ProcessorWrapper.image_pooling_h  5    .8)6q9/3/////rn   c                 f    | j         j        }|j        d         }t          |t                    sJ |S r  rb  )r   r^  image_pooling_ws      ro   rg  z&Molmo2ProcessorWrapper.image_pooling_w  re  rn   c                 f    | j         j        }|j        d         }t          |t                    sJ |S ra  rX  video_processorrc  r#  r   )r   rj  video_pooling_hs      ro   rk  z&Molmo2ProcessorWrapper.video_pooling_h  re  rn   c                 f    | j         j        }|j        d         }t          |t                    sJ |S r  ri  )r   rj  video_pooling_ws      ro   rm  z&Molmo2ProcessorWrapper.video_pooling_w  re  rn   c                     t          | j        dd           | j        j        }n| j        j        }|j        d         |j        d         f}|S )Nr^  r  r  )r  rX  r^  rj  r  )r   rX  base_image_input_sizes      ro   ro  z,Molmo2ProcessorWrapper.base_image_input_size  sL    4>#4d;;G6II6I!*!99>';R S$$rn   c                     t          | j        dd           | j        j        }n| j        j        }|j        }t          |t                    sJ |S )Nr^  )r  rX  r^  rj  r  r#  r   )r   rX  r   s      ro   r   z'Molmo2ProcessorWrapper.image_patch_size  sS    4>#4d;;G6II6I$/*C00000rn   c                     | j         j        }|j        \  }}t          |t                    sJ t          |t                    sJ ||fS r   )rX  r^  overlap_marginsr#  r   )r   r^  left_marginright_margins       ro   rr  z&Molmo2ProcessorWrapper.overlap_margins   sP    .8$3$C!\+s+++++,,,,,,L((rn   c                 F    | j         j        j        p| j         j        j        S r   )rX  r[  	bos_token	eos_tokenr   s    ro   rv  z Molmo2ProcessorWrapper.bos_token
  s    ~'1WT^5M5WWrn   c                     | j         j        S r   )r  r0  r   s    ro   r0  z%Molmo2ProcessorWrapper.image_patch_id  s    ~,,rn   c                     | j         j        S r   )r  r3  r   s    ro   	im_col_idz Molmo2ProcessorWrapper.im_col_id  s    ~**rn   c                     | j         j        S r   )r  image_start_token_idr   s    ro   im_start_idz"Molmo2ProcessorWrapper.im_start_id      ~22rn   c                     | j         j        S r   )r  image_end_token_idr   s    ro   	im_end_idz Molmo2ProcessorWrapper.im_end_id      ~00rn   c                     | j         j        S r   )r  low_res_image_start_token_idr   s    ro   low_res_im_start_idz*Molmo2ProcessorWrapper.low_res_im_start_id  s    ~::rn   c                     | j         j        S r   )r  frame_start_token_idr   s    ro   rL  z%Molmo2ProcessorWrapper.frame_start_id"  r~  rn   c                     | j         j        S r   )r  frame_end_token_idr   s    ro   rM  z#Molmo2ProcessorWrapper.frame_end_id&  r  rn   c                     | j         j        S r   )r  image_low_res_idr   s    ro   im_low_res_idz$Molmo2ProcessorWrapper.im_low_res_id*  s    ~..rn   c                 &    | j         t                   S r   )r\  IMAGE_PROMPTr   s    ro   image_placeholder_idz+Molmo2ProcessorWrapper.image_placeholder_id.      z,''rn   c                 &    | j         t                   S r   )r\  VIDEO_PROMPTr   s    ro   video_placeholder_idz+Molmo2ProcessorWrapper.video_placeholder_id2  r  rn   c                 f    | j         | j        | j        | j        | j        | j        | j        | j        gS r   )r0  rz  r}  r  rL  r  rM  r  r   s    ro   image_token_idsz&Molmo2ProcessorWrapper.image_token_ids6  s<     N$N	
 		
rn   image_heightimage_widthc                    | j         }| j        \  }}| j        }| j        }|||z   z  }|d         |z  }	|	||z   z
  }
|
|z  }t	          ||z
  ||z
  ||          \  }}||fS )Nr   )r  r  r  r  )r_  rr  ro  r   r   )r   r  r  r_  rs  rt  ro  base_image_input_dtotal_margin_pixelscrop_patchescrop_window_patchescrop_window_sizetiling_htiling_ws                 ro   r   z$Molmo2ProcessorWrapper.select_tilingC  s     N	$($8!\ $ :!20L;4NO,Q/3EE*l[.HI.1CC*"55 33'%	
 
 
( !!rn   is_videoc                     | j         }t          |d         |d         | j        |r| j        n| j        |r| j        n| j                  S )Nr   rG   r  r  r  r  r  )ro  r  r   rk  rd  rm  rg  )r   r  ro  s      ro   get_base_grid_sizez)Molmo2ProcessorWrapper.get_base_grid_size[  s^     $ :$)!,)!,,+3M4''9M+3M4''9M
 
 
 	
rn   c                   | j         \  }}| j        }| j        }|||z   z  }|d         |z  }|||z   z
  }	|	|z  }
|                     ||          \  }}||
z  |z   ||
z  |z   g\  }}t	          |||| j        | j                  \  }}||fS )Nr   r  r  r  )rr  ro  r   r   r  rd  rg  )r   r  r  rs  rt  ro  r  r  r  r  r  r  r  r   r   r  r   s                    ro   r  z,Molmo2ProcessorWrapper.get_patches_grid_sizef  s     %)$8!\ $ :!20L;4NO,Q/3EE*l[.HI.1CC!//%# 0 
 
( ''*==''*==
1 -)''
 
 
u e|rn   Ntextrh  videosreturn_tensorsr  c                 T   |g}t          |          }t          | j        dd           |                    |           t          | j        dd           |                    |            | j        |d|i|}|d         d         | j        | j                 k    r|d         d d dd f         |d<   |g }t          |t                    s|g}|g }t          |t                    s|g}t          |          dv s
J d            |	                    d	          }|	                    d
d           }	t          |          dk    rg }
|D ]\}t          |          }|                     |j        |j                  }|
                    t          j        |          dz              ]t!          |
          t          |d                   k    sJ t!          |
          |d                                                                         k    sJ |	                    d          }|d d d df                             d          |d d dd f                             d          z   }||d<   |d         j        d         }|d         |z  |d<   t'          || j        | j        | j        | j        | j                  \  }}||d<   ||d<   t          |          dk    r|	                    d          }|d d df                                         t          |d                   k    sJ |d d df         |d<   |                    d          |d<   |d         j        d         }|d         |z  |d<   t3          || j        | j        | j                  \  }}||d<   ||d<   t9          |          S )Nr^  rj  r  r  r   r   rG   >   r   rG   z)At most one video is supported for Molmo2attention_masktoken_type_idsr   r  r[   image_num_cropsr/  r  r   image_num_pooled_patchesimage_num_patchesrb   rc   rK  rr   video_num_cropsvideo_num_pooled_patchesvideo_num_patchesrt   ru   )r,  r  rX  r  r\  rv  r#  r  r^  popr)  r   r  r  rY   prodr|  r6  r  rJ  r0  r  r}  rz  r  rU  rL  rM  r   )r   r  rh  r  r  r  r   outputs_attention_mask_token_type_ids	num_cropsr!  
image_sizetilingr/  r  	n_patchesrb   rc   rK  rt   ru   s                         ro   __call__zMolmo2ProcessorWrapper.__call__  s    v&&4>#4d;;GMM&!!!4>#4d;;GMM&!!! $.
)
 
 
 ;%DN)CCC#*;#7122#>GK >F&$'' 	XF>F&$'' 	XF6{{f$$$&Q$$$(/4D(E(E(/4Dd(K(Kv;;??I 6 6+E22
++!+!2 * 0 ,     1!45555y>>S)@%A%AAAAAy>>W->%?%C%C%E%E%J%J%L%LLLLL(/M(B(BK5@BQB5G5L5L 6M 6 6AAAqrrE"''A'..6/$ 3KG.//5a8I+23D+E	+QG'(-I#( . .*L* '3GN#*:G&'v;;??(/M(B(BKqqq!t$((**c':O2P.Q.QQQQQ)4QQQT):G%&2=2B2Bq2B2I2IG./ 56<Q?I+23D+E	+QG'(-I##!	. .*L* '3GN#*:G&'G$$$rn   )NNNN).re   rf   rg   rh   r   r   r   r   r  r   r   r\  r_  rd  rg  rk  rm  r   ro  r   rr  rv  r0  rz  r}  r  r  rL  rM  r  r  r  r  r  r   r   r  r  r   r   r   r   r  r   r  r   r   s   @ro   rW  rW    s        #. #=M # # # # # # .tCH~ . . . _. 3    _     _     _     _     _ %uS#X % % % _% 	 # 	  	  	  _	  )sCx ) ) ) _) X3 X X X _X - - - - _- +3 + + + _+ 3S 3 3 3 _3 13 1 1 1 _1 ;S ; ; ; _; 3 3 3 3 _3 1c 1 1 1 _1 /s / / / _/ (c ( ( ( _( (c ( ( ( _( 

c 

 

 

 _

" " 	"
 
sCx" " " "0	
4 	
E#s(O 	
 	
 	
 	
    	 
 
sCx       H 48$($(+/V% V%$y/)D0V% T!V% T!	V%
 j(V% V% 
V% V% V% V% V% V% V% V%rn   rW  	video_fpssampling_fpsmax_fpsc                    t          |           } t          |          }t          |          }|t          d          | dk    s|dk    rt          d|  d| d          | |z  dk    rt          d| d|  d	          g }t          || d
z   |          D ]5}||k    r n,| |z  dk    r"|                    t	          |                     6|S )aE  
    Return the subset of `video_fps` factors that remain multiples
    of `sampling_fps`.

    Examples:
        >>> get_candidate_target_fps(video_fps=6, sampling_fps=2)
        [2, 6]
        >>> get_candidate_target_fps(video_fps=5, sampling_fps=1)
        [1, 5]
        >>> get_candidate_target_fps(video_fps=2, sampling_fps=2)
        [2]
        >>> get_candidate_target_fps(video_fps=5, sampling_fps=2)
        Traceback (most recent call last):
            ...
        ValueError: sampling_fps=2 must divide video_fps=5 to produce
            consistent frame steps.
    Nzsampling_fps must be providedr   z1video_fps and sampling_fps must be positive (got z, )zsampling_fps=z must divide video_fps=.rG   )r   r&  r  r  r   )r  r  r  
candidates	candidates        ro   get_candidate_target_fpsr    s(   , II|$$L'llG8999A~~**11 1!-1 1 1
 
 	
 <1$$MLMMMMM
 
 	
 J<QEE 0 0	wEy A%%eI..///rn   
max_framestotal_framesframe_sample_modecandidate_target_fpsc                     d}d}|D ]d}t          t          | |z            d          }t          ||z            }	|dk    rd|v r|	|k    r n|}|	}K||	k    sJ |	|k    rZ|	|k    r|}|	}e|S )zV
    Get the target fps that best spans the video and has the most frames sampled
    r   NrG   uniform)r   r   )
r  r  r  r  r  num_frames_sampledselected_target_fps
target_fps	step_sizenum_frames_sampled_at_fpss
             ro   get_target_fpsr    s     * ? ?
I
233Q77	$'y(@$A$A!""...-
::",!: &)BBBBB(:55*-??? '1#%>"rn   c                     | t          j        d||dt                    }n6t          t          || z            d          }t          j        d||          }t          |          |k    r
|d |         }| |fS )Nr   F)endpointr%  rG   )rY   linspacer   r   rx  r^  )r  r  r  r  frame_indicesr  s         ro   get_frame_times_and_chosen_fpsr  8  s     "|Z%s
 
 
 I(;;<<a@@		!\9==
=J&&%kzk2--rn   c                   0   e Zd ZdedefdZdeeedz  f         fdZ	dddeded	edz  defd
Z
ddded	edz  defdZdefdZdedefdZdedeeef         defdZdedededededededej        fdZ	 ddeeef         dedz  dee         fdZdS )Molmo2ProcessingInfor  r   c                 x     | j         j        di |}| j                                         }t          ||          S )Nrm   )ctxget_hf_processorget_hf_configrW  )r   r  rX  r  s       ro   r  z%Molmo2ProcessingInfo.get_hf_processorH  s?    -DH-7777	H**,,	%i;;;rn   Nc                     d ddS )NrG   r!  videorm   r   s    ro   get_supported_mm_limitsz,Molmo2ProcessingInfo.get_supported_mm_limitsM  s    ***rn   )rX  r  r  rX  c                B   ||                                  }|j        }|                    d          \  }}|j        |j        }n|j        }d||t          |          z   z  z   }|                    ||          \  }	}
d|	|
t          |j                  z   z  z   }||z   S )NFr  r  r  )r  rX  r  use_single_crop_col_tokensimage_use_col_tokensr   r  )r   r  r  rX  hf_processorresize_nrowsresize_colsuse_col_tokensextraoverlap_nrowsoverlap_ncolsjoints               ro   get_num_image_tokensz)Molmo2ProcessingInfo.get_num_image_tokensP  s     --//I *$-$@$@%$@$P$P!k2>)DNN)>NLK#n2E2E$EFF'0'F'F%# (G (
 (
$} MC ABBB
 
 u}rn   
num_framesc                    ||                                  }|                    d          \  }}d||t          |j        j                  z   z  z   }||z  S )NTr  r  )r  r  r   rX  video_use_col_tokens)r   r  rX  r  r  r  s         ro   get_num_video_tokensz)Molmo2ProcessingInfo.get_num_video_tokensm  sj     --//I$-$@$@$$@$O$O!kL#i1FGGG
 
 E!!rn   c                    |                                  }|j        \  }}|j        }|j        }|||z   z  }|d         |z  }|||z   z
  }||z  }	t	          |j                  }
d\  }}|
D ]F\  }}||	z  |z   }||	z  |z   }|                     |||          }||k    r|}t          ||          }G|dk    s|t          d          |S )Nr   )r   N)r  r  rX  )r  r  z(Cannot have a largest feature size of 0!)	r  rr  ro  r   r
  r_  r  r9   r&  )r   rX  rs  rt  ro  r  r  r  r  r  r	  largest_feature_sizelargest_feature_pinpointhrwrr  r  	feat_sizes                     ro   !get_image_size_with_most_featuresz6Molmo2ProcessingInfo.get_image_size_with_most_features}  s0   ))++	$-$=!\ ) ?&70L;4NO,Q/3EE*l[.HI.1CC'	(;<<9@66 		Q 		QFB**-@@F)),??E11#) 2  I ///'0$+45+P+P+P(1$$(@(HGHHH''rn   
max_tokensc                 X    |                      d          }||z  }t          |d          S )NrG   )r  )r  r   )r   r  num_tokens_per_framer  s       ro   _get_max_video_framesz*Molmo2ProcessingInfo._get_max_video_frames  s4    #88A8FF#77
:q!!!rn   seq_len	mm_countsc                    |                                  j        j        }|j        }|                    dd          }|                     |          }t          |t          |d          z  |          }t          |d          S )Nr  r   rG   )r  rX  rj  r  getr  r  r   )r   r  r  rj  r  
max_videosmax_total_framesmax_frames_per_videos           ro   !get_num_frames_with_most_featuresz6Molmo2ProcessingInfo.get_num_frames_with_most_features  s    
 //11;K$/
]]7A..
55g>>"J 2 22 
  
 '+++rn   total_num_framesr  durationr  r  r  c                    |dk    r=|:|dk    r.t          j        |                              t                    }n||dz
  |z  k    rCt          j        d|dz
  t          ||          d                              t                    }nMt          j        d|dz
  t          ||z                      }	t          j        |	d	                   |dz
  k    rt          j        |	|dz
  ggd
          }	t          j        |	                              t                    }|d	         |k     sJ t          |	          |k    sJ n|dk    rBt          j        d|dz
  t          ||          d                              t                    }nN|dk    r9t          ||          }
t          |||||
          }t          ||||          \  }}nt          |          |S )Nuniform_last_framer  rG   r   T)numr  g        )stopstepr   )r  fps)rY   rx  r  r   r  r  r   roundconcatenater^  r  r  r  NotImplementedError)r   r  r  r  r  r  r  r  indicesfloat_indicesr  r  r   s                ro   _sample_framesz#Molmo2ProcessingInfo._sample_frames  s,     4449L1$$)$455<<SAAZ!^w666+$q(J(899!	  
 &++  !#	)A-y7233! ! !
 8M"-..2BQ2FFF$&N&)9A)=(>?a% % %M (=1188==r{%55555=))Z77777"666k 1$
$455	  
 fSkk G %''#;I|#T#T "0 !$# # 8# 	 JAww &&7888rn   metadatado_sample_framesc           	      `   |                                  j        j        }|d         |                    d          }||                    dd          }|rE|d         }|z  }|j        }|j        }|j        }	|j        }
|                     |||||	|
          }n|J fd|D             }|S )Nr  frames_indicesr  Fr  c                     g | ]}|z  S rm   rm   )r   	frame_idxr  s     ro   r   z>Molmo2ProcessingInfo._get_video_second_idx.<locals>.<listcomp>  s    LLL	i)+LLLrn   )	r  rX  rj  r   r  r  r  r  r  )r   r  r  rj  r  r  r  r  r  r  r  
timestampsr  s               @ro   _get_video_second_idxz*Molmo2ProcessingInfo._get_video_second_idx  s    
 //11;KUO	!&677#'||,>FF 	.'(:;')3H / A(3J%-G*7L!00 ! NN "---LLLL^LLL
rn   r   )re   rf   rg   r  rW  r  r   r   r   r  r  r  r9   r  r  r  r   rY   r$  r  r  r   r   r  r  rm   rn   ro   r  r  G  s       < <4J < < < <
+cDj)A + + + + 48    	
 *D0 
   B 48	" " " " *D0	"
 
" " " " (9 ( ( ( (>" " " " " "
,, 38$, 
	, , , ,;; ; 	;
 ; ; ; ; 
; ; ; ;@ )-! !sCx.! +! 
e	! ! ! ! ! !rn   r  c                       e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	ded	ed
edede
e         f
dZdS )Molmo2DummyInputsBuilderr  r   c                     |                     dd          }|                     dd          }t          t          }|dk    r}n.d                    fdt	          |          D                       }|||z  z   S )Nr!  r   r  rG   r   c                 &    g | ]}d |dz    z   S )zImage rG   rm   )r   r  image_placeholder_tokens     ro   r   z;Molmo2DummyInputsBuilder.get_dummy_text.<locals>.<listcomp>  s,    WWW!!a%!!$;;WWWrn   )r   r  r  joinr  )r   r  
num_images
num_videosvideo_placeholder_tokenimage_stringr  s         @ro   get_dummy_textz'Molmo2DummyInputsBuilder.get_dummy_text  s    ]]7A..
]]7A..
".".??2LL77WWWWU:EVEVWWW L 5
BBBrn   Nr  
mm_optionsc                    |                     dd          }|                     dd          }g }g }|dk    rN| j                                        \  }}	|r|                     d          nd }
|                     ||	||
          }|dk    r| j                                        }|j        }| j                            ||          }|r|                     d          nd }|rst          |t                    sJ |j	        }|rS||k    rt                              d||           |dk     rt                              d|           t          ||          }|                     |d         |d         ||	          }||d
S )Nr!  r   r  )r  r  r!  	overridesz]video.num_frames override (%d) exceeds model's maximum number of frames (%d), will be ignoredr  zEvideo.num_frames override (%d) cannot be less than 2, will be ignoredrG   )r  r  r  r"  r  )r   infor  _get_dummy_imagesr  ro  r  r#  r   r  loggerwarningr  _get_dummy_videos)r   r  r  r&  r!  r"  dummy_imagesdummy_videostarget_widthtarget_heightimage_overridesrX  ro  target_num_framesvideo_overridesnum_frames_overrides                   ro   get_dummy_mm_dataz*Molmo2DummyInputsBuilder.get_dummy_mm_data"  s    ]]7A..
]]7A..
>>*.)*U*U*W*W'L-9CMjnnW555O11"$%)	 2  L >>	2244I$-$C! $	 K K! ! :DMjnnW555O T!/3DEEEEE&5&@#& T*->>>M/-	   +Q..6/  
 ),,=?R(S(S%11+A.,Q/,%	 2  L "!
 
 	
rn   r  r  r  r"  c          
         t          j        |||dfdt           j                  }g }t          |          D ]U}d|dz  |t	          t          |                    dd||d}|                                |f}	|                    |	           V|S )Nr     r  g       @decordF)r  r  r  r  video_backendr  r  r  )rY   fulluint8r  r  copyr  )
r   r  r  r  r"  r  video_itemsr  video_metadata
video_items
             ro   r-  z*Molmo2DummyInputsBuilder._get_dummy_videosb  s     VUA628LLLz"" 	+ 	+A&,$."&uZ'8'8"9"9!)$) 	 	N  **,,7Jz****rn   r   )re   rf   rg   r   r   r   r%  r   r4   r6  r  r7   r-  rm   rn   ro   r  r    s        CS(9 Cc C C C C( =A	>
 >
>
 38$>
 C!112T9	>

 
>
 >
 >
 >
@  	
   
i     rn   r  c            
            e Zd Zdee         dee         fdZdefdZdede	ee
f         de	ee
f         de	ee
f         def
 fd	Zd
ede	ee
f         de	eef         fdZdede	ee
f         dedee         fdZ xZS )Molmo2MultiModalProcessorprompt_tokensr   c                     | j                                         }|j        j        }|j        p|j        }t          |          dk    r|d         |k    r|g|z   }|S ra  )r)  r  rX  r[  bos_token_ideos_token_idr^  )r   rC  rX  r[  rE  s        ro   _apply_hf_processor_tokens_onlyz9Molmo2MultiModalProcessor._apply_hf_processor_tokens_only}  sf     I..00	'1	 -G1G}!!mA&6,&F&F)N]:Mrn   c                 "    t          d          S )NT)video_needs_metadata)r;   r   s    ro   _get_data_parserz*Molmo2MultiModalProcessor._get_data_parser  s    #>>>>rn   promptmm_data	mm_kwargs
tok_kwargsc                    t          |          } | j        j        di |}|                    dg           x}rIg }g }g }	g }
g }g }g }|D ]}|\  }t          di |}d|vr                    dd          |d<   t          di fdD             t                      }|gg|d<   gg|d<   t                                          t          |||          }|                    d          }|j	        j
                            |          d         }|                    t          |d	          }|                    |d
                    |                    |d                    |	                    |d                    |
                    |d                    |                    |d                    |                    |d                    |                    |d                    t          t          j        |          t          j        |          t          j        |	          t          j        |
          t          j        |          t          j        |          t          j        |                    }nt                      }t                                          ||||          }|j        |j                 }|d         }|                                dk    rJ|d         |k    r>t          j        |gg|j        |j                  }t          j        ||gd	          |d<   t          |fi |}t/          |          S )Nr  r  Fc                 .    i | ]}|d k    ||         S )r  rm   )r   r  r  s     ro   
<dictcomp>z@Molmo2MultiModalProcessor._call_hf_processor.<locals>.<dictcomp>  s*    SSS!1@R;R;Rq(1+;R;R;Rrn   r?  )rK  rL  rM  rN  r  r   rG   rr   video_token_poolingr  r  r  rt   ru   )rr   rR  r  r  r  rt   ru   r  rs  r   rm   )r  r)  r  r  r   r   r   _call_hf_processorr  rX  r[  batch_decoder  r  ri   rj  r\  rv  numeltensorrg  r%  concatr   )r   rK  rL  rM  rN  rX  r  pixel_values_videos_lstvideo_token_pooling_lstvideo_num_crops_lstvideo_num_pooled_patches_lstvideo_num_patches_lstvideo_tokens_lstnum_video_tokens_lstr6  video_arrayvideo_mm_kwargsvideo_mm_datavideo_outputsr  video_stringprocessed_outputsrE  bos_token_id_tensorcombined_outputsr  r   s                            @ro   rS  z,Molmo2MultiModalProcessor._call_hf_processor  s    w--.DI.;;;;	[[2...6 E	#&(#&(#"$+-($&!!#%  0O 0O(,%X #'"3"3"3"3%_<< ;C,,*E; ;O$67 )  SSSSxSSS  !%,7=/h'4<:,./ % : :')-)	 !; ! ! *--k::	(2<II)TTUVW    (..}=R/STTT'..}=R/STTT#**=9J+KLLL,33!"<=   &,,];N-OPPP ''n(EFFF$++M:L,MNNNN $)I.E$F$F$)I.E$F$F %	*= > >).3O)P)P"'),A"B"B"Y'788!&+?!@!@  MM !FFM!GG66!	 7 
 
 !y':;%k2	??q  Yt_%D%D"', )9# # # .3\$i0a. . .k*  
 

 
 ,---rn   	hf_inputshf_processor_mm_kwargsc                     |                     dt          j        d                    }|                     dt          j        d                    }|                     dt          j        d                    }|                     dt          j        d                    }|                     dt          j        d                    }|                     dt          j        d                    }t          t	          j        d|          t	          j        d|          t	          j        d          t	          j        d          t	          j        d          t	          j        d|          t	          j        d          t	          j        d	|          t	          j        d	|          t	          j        d	          t	          j        d	          t	          j        d	          t	          j        d	|          t	          j        d	          
          S )Nr  r   r  r  r  rc   ru   r!  r  )r[   image_token_poolingr  r  r  rb   rc   rr   rR  r  r  r  rt   ru   )r   ri   r7  r  r5   flat_from_sizesbatched)	r   rg  rh  r  r  r  r  rc   ru   s	            ro   _get_mm_fields_configz/Molmo2MultiModalProcessor._get_mm_fields_config  s   
 $--(95;q>>JJ#,==&A$
 $
  $--(95;q>>JJ#,==&A$
 $
  %==);U[^^LL$==);U[^^LL.>  !6 E1! ! 29'BB%:%B7%K%K3;GDD.>)  3:7CC 5 E! ! !6 E1! ! 29'BB%:%B7%K%K3;GDD.>)  3:7CC5
 
 
 	
rn   mm_itemsout_mm_kwargsc                    	
   j         j        di j        	j        j        
j        j        j        j        j        j        j	        j        j
        j        j        dt          dt          t                   f	
f	d}dt          dt          t                   f	
 f
d}d t          ddgj        j        g||g          D             S )	Nitem_idxr   c                   	                      dt                    }|                    |           }t          |          }                    d          \  }}}n}rj        }n}g|z  gt          |          z  z   }|g||z  z   gz   }t          |          }	                    |	j	        |	j
                  \  }
}g|z  gt                    z  z   }g||
z  z   gz   }||z   }t          j        |j                  S )Nr!  Fr  r  )	get_itemsr8   r   r,  r  r  r   r)  r  r  r  r@   select_token_idsr  )rq  rh  r!  r  r  r  start_id	extra_rowextra_jointr  r  r   	joint_rowr  img_token_idsr  
img_col_id
img_end_idimg_patch_idimg_start_idrn  rX  r  use_single_crop_start_tokens                  ro   get_image_replacement_molmo2zSMolmo2MultiModalProcessor._get_prompt_updates.<locals>.get_image_replacement_molmo24	  sf   ''1DEEFJJx((E!%((E(1(D(De(D(T(T%L+)5!;!5* ($8'%4
|cG G 8 I $*y<'??:,NK'..J$::'.&, ;  LE5
 &.*$A A 2 I "NY%66*EE'%/M&7)  rn   c                 
  
 d         |          \  }}                     d          }j                            ||          }                    d          \  }}rj        }j        }n}}g }	t          |          D ]h\  }
}|
dk    rdnd}||ddz   }|	j        j        	                    |d	
          z  }	g|z  gt                    z  z   }|g||z  z   |gz   }|	|z  }	it          j        |	j                  S )Nr  r  Tr  r    r   z.1fF)add_special_tokens)r   r)  r  r  rL  rM  	enumeraterX  r[  encoder   r@   rt  r  )rq  r  r  r  r  r  r   ru  end_idry  r  
frame_time
prev_spaceframe_prefixrx  r  rh  rz  r{  r|  r}  rn  rX  r   use_frame_special_tokensr  s                   ro   get_video_replacement_molmo2zSMolmo2MultiModalProcessor._get_prompt_updates.<locals>.get_video_replacement_molmo2Y	  sm   &w/9OE8599:LMM88CSTTJ$777FFLE5' $$3"/'#M)2:)>)> ' '%	:$-MMSSr
J!5!5!5!55  !4!>!E!E ', "F " " 
 *NU2j\C(E E 6 	 "
UY%66&A&&7)  rn   c                 <    g | ]\  }}}t          ||g|           S ))modalitytargetreplacement)r>   )r   r  r  replacement_fns       ro   r   zAMolmo2MultiModalProcessor._get_prompt_updates.<locals>.<listcomp>	  sJ     
 
 
 1&. !x*  
 
 
rn   r!  r  rm   )r)  r  r0  rz  r}  r  rX  r  r  r~  r  r  r   r  zipr  r  )r   rn  rh  ro  r  r  r  rz  r{  r|  r}  rX  r  r  r~  r  s   ```   @@@@@@@@@@ro   _get_prompt_updatesz-Molmo2MultiModalProcessor._get_prompt_updates#	  s    /DI.HH1GHH	 /(
 ,(
(2G%.%8%S"&/&9&U#(2G#,#6#O #	3 #	49 #	 #	 #	 #	 #	 #	 #	 #	 #	 #	 #	 #	 #	 #	J$	3 $	49 $	 $	 $	 $	 $	 $	 $	 $	 $	 $	 $	 $	 $	 $	 $	L
 
 58'"/1OP-/KL5 5
 
 
 	
rn   )re   rf   rg   r  r   rG  r;   rJ  r   r   r  r   rS  r5   rm  r:   r6   r   r?   r  r   r   s   @ro   rB  rB  |  sw       Cy 
c   ?"6 ? ? ? ?f.f. f%f. 3;'	f.
 CK(f. 
f. f. f. f. f. f.P+
+
 !(V 4+
 
++	,	+
 +
 +
 +
Zg
%g
 !(V 4g
 -	g

 
,	g
 g
 g
 g
 g
 g
 g
 g
rn   rB  )r)  dummy_inputsc                       e Zd Z eddddddddd	d
dddddddddd          Zdgdgg dddgddgdZededededz  fd             Z	d!d"d#e
d$ef fd%Zed&             Zd'ededz  fd(Zd'ededz  fd)Zd'edefd*Zd+edeej        d,f         fd-Zd.edeej        d,f         fd/Zd'ededz  fd0Z	 dAdd1d2d3ej        d4edz  d5ej        dz  d6edej        f
d7Z	 	 dBd3ej        d8ej        d9edz  d:ej        dz  d'edej        fd;Z d<ej        dej        fd=Z!d>e"eeej        f                  fd?Z#de$fd@Z% xZ&S )CMolmo2ForConditionalGenerationzimage_pooling_2d.q_projzimage_pooling_2d.k_projzimage_pooling_2d.v_projzimage_pooling_2d.o_projzimage_projector.gate_projzimage_projector.up_projzimage_projector.down_projr  r2  r  r  r  rN  r  r  )zimage_pooling_2d.wqzimage_pooling_2d.wkzimage_pooling_2d.wvzimage_pooling_2d.wozimage_projector.w1zimage_projector.w3zimage_projector.w2att_projattn_outr  r  ff_projff_out	attn_normff_normzvision_backbone.zmodel.layers.zmodel.norm.)zmodel.vision_backbone.zmodel.transformer.blocks.zmodel.transformer.ln_f.)orig_to_new_substrorig_to_new_prefixrR  rV  rW  rX  rY  )r  r  r   r1  rL  r  r  r   Nc                     |                     d          rt          S |                     d          rt          S t          d          )Nr!  r  z)Only image or video modality is supported)
startswithr  r  r&  )clsr  r  s      ro   get_placeholder_strz2Molmo2ForConditionalGeneration.get_placeholder_str	  sI    w'' 	 w'' 	 DEEErn   r   r   r  r   c          
         t                                                       |j        j        }|j        }|j        j        }|| _        || _        i }t          t                    D ]$}t          |j
        |j                  ||j        <   %t          d	i |}i }t          t                    D ]$}t          |j        |j                  ||j        <   %t          d	i |}	|                     |ddh          5  t          ||	|t!          |d                    | _        d d d            n# 1 swxY w Y   |                     |          5  t'          |t!          |d                    | _        d d d            n# 1 swxY w Y   |j        | _        t/          |d          r|j        }
n|j        }
t5          |
j        |
j        |          | _        t=          |
j                  | _        | j        j         | _         d S )
Nr!  r  vision_backboner   model)r  r   r  r  rm   )!r   r   r  r  r   multimodal_configr   r   rw   r  rZ  r  r   r[  _mark_tower_modelrQ  rT   r  _mark_language_modelr  r  r0  r|  r  r  r  r/   r   ry   lm_headr,   logits_processorr  )r   r  r   r   r   r  r  r  rZ  r[  r  r   s              ro   r   z'Molmo2ForConditionalGeneration.__init__	  s   )3"/'4F!2I&& 	H 	HE!():EJ!G!GF5:((((
M** 	L 	LE!()>
!K!KF5:&0000##K'71CDD 	 	#7#F,=>>	$ $ $D 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 &&{33 	 	('#FG44  DJ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 #16=)) 	/#/NN#.N%%&%
 
 

 !00I J J J6 	,,,s$   5'D((D,/D,%E99E= E=c                 N    t          |                                           j        S r   )next
parametersr%  r   s    ro   r%  z$Molmo2ForConditionalGeneration.dtype	  s    DOO%%&&,,rn   r  c                    |                     dd           }|d S |                     dd           }|                     dd           }|                     dd           }|                     dd           }|                     dd           }dg|                    d          d d	                                         z   }d}	|                                }
t	          |          D ]Q\  }}||	|	|z            }t          ||                   }t          j        |dk    ||z   |          |
|	|	|z   <   |	|z  }	Rt          ||
|||
          S )Nr[   rj  r  r  rb   rc   r   r   r   )r[   r^   r`   rb   rc   )	r  cumsumr8  cloner  r   ri   r}  rW   )r   r  r[   r^   r`   num_patchesrb   rc   accum_patchespatch_offsetnew_token_poolingr  n	cur_sliceindex_offsets                  ro   _parse_and_validate_image_inputz>Molmo2ForConditionalGeneration._parse_and_validate_image_input	  s{    zz.$774

#8$??#ZZ(BDIIjj!4d;;zz.$77!::&8$??k00Q077<CCEEE)//11011 	 	DAq%l\A5E&EFI}Q/00LAFQL(B Bl\A-==>
 ALL %+1%-
 
 
 	
rn   c                    |                     dd           }|d S |                     dd           }|                     dd           }|                     dd           }|                     dd           }|                     dd           }dg|                    d          d d	                                         z   }d}	|                                }
t	          |          D ]Q\  }}||	|	|z            }t          ||                   }t          j        |dk    ||z   |          |
|	|	|z   <   |	|z  }	Rt          ||
|||
          S )Nrr   rR  r  r  rt   ru   r   r   r   )rr   r^   r`   rt   ru   )	r  r  r8  r  r  r   ri   r}  rq   )r   r  rr   r^   r`   r  rt   ru   r  r  r  r  r  r  r  s                  ro   _parse_and_validate_video_inputz>Molmo2ForConditionalGeneration._parse_and_validate_video_input
  s}    %jj)>EE&4

#8$??#ZZ(BDIIjj!4d;;zz.$77!::&8$??k00Q077<CCEEE)//11011 	 	DAq%l\A5E&EFI}Q/00LAFQL(B Bl\A-==>
 ALL  3+1%-
 
 
 	
rn   c                 t    i }|D ]2}|dv rd|vr | j         di ||d<   |dv rd|vr | j        di ||d<   3|S )N)r[   rh  )rr   r  rm   )r  r  )r   r  
modalities	input_keys       ro   %_parse_and_validate_multimodal_inputszDMolmo2ForConditionalGeneration._parse_and_validate_multimodal_inputs?
  s    
 	V 	VI---(*2L2L'Kt'K'U'Uf'U'U
8$4449S9S'Kt'K'U'Uf'U'U
8$rn   image_input.c                    |d         }|d         }|d         }|d         }|d         }|                      |                    d          |                    d                    }t          |          |                                k    sJ |                    |                                d          }|                    |                                d          }	g }
t          ||	          D ]Q\  }}|                                                     |          }|| j	        k    }|||<   |

                    |           Rt          |
          S )	Nr[   r^   r`   rb   rc   r   rh  r^   r   r  r  r^  r|  r   r8  r  get_language_modelr  r|  r  r   )r   r  r[   r^   r`   rb   rc   image_features_flatimage_features_listimage_tokens_listrB  image_features_iimage_tokens_iout_featuresis_image_patchs                  ro   _process_image_inputz3Molmo2ForConditionalGeneration._process_image_inputI
  sw    #>2#O4()=>">2&'9:"22))!,,'11!44 3 
 

 &''+=+A+A+C+CCCCC177%%''Q 8 
 
 )../?/F/F/H/Ha.PP03!21
 1
 	% 	%,n  2244DD^TTL+t/@@N+;L(JJ|$$$$Szzrn   video_inputc                    |d         }|d         }|d         }|d         }|d         }|                      |                    d          |                    d                    }t          |          |                                k    sJ |                    |                                d          }|                    |                                d          }	g }
t          ||	          D ]Q\  }}|                                                     |          }|| j	        k    }|||<   |

                    |           Rt          |
          S )	Nrr   r^   r`   rt   ru   r   r  r   r  )r   r  rr   r^   r`   rt   ru   r  r  video_tokens_listrB  r  video_tokens_ir  r  s                  ro   _process_video_inputz3Molmo2ForConditionalGeneration._process_video_inputg
  sy    **?@#O4()=>">2&'9:"22&0033'11!44 3 
 

 &''+=+A+A+C+CCCCC177%%''Q 8 
 
 )../?/F/F/H/Ha.PP03!21
 1
 	% 	%,n  2244DD^TTL+t/@@N+;L(JJ|$$$$Szzrn   c                      | j         di |}|sg S d}|D ]R}|dk    r"|d         }|                     |          }||z  }|dk    r"|d         }|                     |          }||z  }S|S )Nrm   rh  r  )r  r  r  )	r   r  r  multimodal_embeddingsr  r  image_embeddingsr  video_embeddingss	            ro   embed_multimodalz/Molmo2ForConditionalGeneration.embed_multimodal
  s    ?T?II&II
 	I:<" 	: 	:H8##(2#'#<#<[#I#I %)99%8##(2#'#<#<[#I#I %)99%$$rn   Fis_multimodalhandle_oov_mm_tokenr  r  r  r  c                    |                      ||                                 j        ||          }|t          |          dk    r|S |t	          d          t          |||          }|S )Nr  r   z`embed_input_ids` now requires `is_multimodal` arg, please update your model runner according to https://github.com/vllm-project/vllm/pull/16229.)r  r  r  )_embed_text_input_idsr  r  r^  r&  rO   )r   r  r  r  r  r  s         ro   r  z.Molmo2ForConditionalGeneration.embed_input_ids
  s     22##%%5' 3	 3 
 
 !(C0E,F,F!,K,K   C   5'"7'
 
 

 rn   r  r  r  c                 2    |d } | j         |||fd|i|}|S )Nr  )r  )r   r  r  r  r  r  r  s          ro   r   z&Molmo2ForConditionalGeneration.forward
  sL      + M"
 
 
 (	

 
 
 rn   r  c                 <    |                      | j        |          }|S r   )r  r  )r   r  logitss      ro   compute_logitsz-Molmo2ForConditionalGeneration.compute_logits
  s    &&t|]CCrn   r  c                 v    t          |           }t          |          }|                    || j                  S )N)mapper)rM   "_get_weights_with_merged_embeddingr  hf_to_vllm_mapper)r   r  loaders      ro   r  z+Molmo2ForConditionalGeneration.load_weights
  s8    "4((4W==""743I"JJJrn   c                 0    t          j        ddd          S )z<
        Get the module prefix in multimodal models
        r  zvision_backbone.image_projectorr  )language_model	connectortower_model)r2   from_string_fieldr   s    ro   get_mm_mappingz-Molmo2ForConditionalGeneration.get_mm_mapping
  s'     /"7)
 
 
 	
rn   r   r  )'re   rf   rg   rN   r  r  classmethodr   r   r  r   r   r   r%  r  rW   r  rq   r  r  r  r   ri   rj   r  r  rH   r  r   r  
LongTensorrB   r   r  r   r  r2   r  r   r   s   @ro   r  r  	  s        & $=#<#<#<"=";"=" %!*1#
 
* '9)8'4
 
)  <  L'(((()%y1  F3 F3 F3: F F F [F BD 0
 0
 0
z 0
3 0
 0
 0
 0
 0
 0
d - - X-!
!
 
T	!!
 !
 !
 !
F!
!
 
T	!!
 !
 !
 !
Ff     & 
u|S 	!   <& 
u|S 	!   <% %4H44O % % % %, >B
 .2$)  <  4d:
 |d* " 
   H <@-1 # # 2D8	
 |d*  
   *EL U\    KHU33D-E$F K K K K

 
 
 
 
 
 
 
 
rn   r  r  c              #      K   i }| D ]\  }}d|v r||d<   d|v r||d<   ||fV   d|vsd|vrt          d          t          j        |d         |d         gd          }d|fV  d S )	Nzwte.embedding	embeddingzwte.new_embeddingnew_embeddingzYCheckpoint is missing 'wte.embedding' or 'wte.new_embedding' weights required for Molmo2.r   r   zmodel.embed_tokens.weight)r&  ri   rj  )r  embedding_weightsr  re  s       ro   r  r  
  s        ! !fd""-3k** D((17o...     +++FW/W/W?
 
 	

 		;	'):?)KL   '(9
::::::rn   )r  collections.abcr   r   r   dataclassesr   r   	functoolsr   r	   	itertoolsr
   typingr   r   numpyrY   ri   torch.nnr   torch.nn.functional
functionalr!  PILr   	PIL.Imager   transformersr   r   r   r   transformers.image_utilsr   $transformers.tokenization_utils_baser   transformers.video_utilsr   r   vllm.attention.layerr   vllm.compilation.decoratorsr   vllm.configr   r   vllm.config.multimodalr   r   vllm.distributedr   r   r   r    r!   vllm.loggerr"   %vllm.model_executor.layers.activationr#   r$   r%   9vllm.model_executor.layers.attention.mm_encoder_attentionr&   $vllm.model_executor.layers.layernormr'   !vllm.model_executor.layers.linearr(   r)   r*   r+   +vllm.model_executor.layers.logits_processorr,   'vllm.model_executor.layers.quantizationr-   +vllm.model_executor.layers.rotary_embeddingr.   3vllm.model_executor.layers.vocab_parallel_embeddingr/   r0   -vllm.model_executor.model_loader.weight_utilsr1   )vllm.model_executor.models.module_mappingr2   vllm.multimodalr3   vllm.multimodal.inputsr4   r5   r6   r7   vllm.multimodal.parser8   r9   r:   r;   vllm.multimodal.processingr<   r=   r>   r?   r@   'vllm.multimodal.processing.dummy_inputsrA   vllm.sequencerB   vllm.utils.math_utilsrD   vllm.utils.tensor_schemarE   rF   
interfacesrH   rI   rJ   rK   rL   utilsrM   rN   rO   rP   rQ   rR   rS   rT   re   r+  r  r  _MAX_VIDEO_FPSrW   rq   rw   r   r   Moduler   r   r   r   r	  r,  rI  rQ  r  r  r  r  r  r   r   r  r  r
  r   r)  r,  r  rJ  rU  rW  r   r  r   r  r  r  r  rB  register_processorr  rj   r  rm   rn   ro   <module>r     s    7 7 7 7 7 7 7 7 7 7 ) ) ) ) ) ) ) ) . . . . . . . .       ! ! ! ! ! ! ! !                                            0 / / / / / : : : : : : > > > > > > > > * * * * * * = = = = = = / / / / / / / / F F F F F F F F              $ # # # # # T T T T T T T T T T X X X X X X 8 8 8 8 8 8            H G G G G G F F F F F F @ @ @ @ @ @        P O O O O O D D D D D D / / / / / /                                    K J J J J J - - - - - - , , , , , , > > > > > > > >             	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 
X		
 A A A A A A A A6A A A A A A A A6 F F F F F F F F0 ! ! ! ! ! ! ! ! R R R R R R R Rj! ! ! ! !RY ! ! !HG G G G Gbi G G GT% % % % %	 % % %P    ")   :I I I I Ibi I I IX@ @ @ @ @BI @ @ @F& & & & &	 & & &Rm m m m m29m m m m`O O O O Obi O O Od$ $ $ $ $ry $ $ $N7' 7' 7' 7' 7' 7' 7' 7't' ' ' ' '"4 ' ' '6 j j j j jbi j j jZ  	
   38_   $>3 >4c3h+@ > > > >!! ! 	!
 ! ! ! !.	?* 	? 	? 	? 	? 	?$   =!==  = 	=
 = = 5U--.= = = =@%!%% % 	%
 5U--.% % % %Pq% q% q% q% q% q% q% q%n	 *- -U{-+- 5[- 
%[	- - - -`$$$ $ 	$
 u+$ T\$ $ $ $N. . .G G G G G- G G GTh h h h h56JK h h hVN
 N
 N
 N
 N
 78L M N
 N
 N
b ('	)  
L
 L
 L
 L
 L
I!:|]L
 L
 
L
^
;eC-./;eC%&'; ; ; ; ; ;rn   