
    .`i                        d dl Z d dlmZmZmZ d dlmZ d dlmZm	Z	 d dl
mZ d dlmZ d dlZd dlZd dlmZ d dlmc mZ d dlmZ d dlmZmZmZmZ d d	lmZ d d
lm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z&m'Z' d dl(m)Z) d dl*m+Z+m,Z,m-Z-m.Z.m/Z/ d dl0m1Z1m2Z2m3Z3 d dl4m5Z5 d dl6m7Z7 d dl8m9Z9m:Z:m;Z;m<Z< d dl=m>Z> d dl?m@Z@ d dlAmBZB d dlCmDZDmEZE d dlFmGZG d dlHmIZI d dlJmKZK d dlLmMZMmNZNmOZO d dlPmQZQmRZRmSZS d dlTmUZUmVZVmWZWmXZXmYZYmZZZm[Z[ d dl\m]Z] d dl^m_Z_m`Z` d d!lambZbmcZcmdZdmeZemfZf d d"lgmhZhmiZimjZjmkZkmlZlmmZm d#d$gZnd Zod%Zpd&Zqd'Zrd(Zsd)Ztd*Zu G d+ d,e_          Zve G d- d.                      Zw G d/ d0ejx                  Zy G d1 d2ejx                  Zz G d3 d4ejx                  Z{ G d5 d6ejx                  Z|d7ej}        d8e~d9ej}        fd:Z G d; d<ejx                  Z G d= d>ejx                  Z G d? d@ejx                  Z G dA dBejx                  Z G dC dDejx                  Z G dE dFe          Z G dG dHejx        ef          Ze$ G dI dJejx        ef                      ZdKe~dLe~d9e~fdMZdNe~dOe~dPe~dQe~dRe~d9e~fdSZdTe~dUe~dOe~dPe~dQe~dRe~d9ee~e~f         fdVZdWe~d9eee~e~f                  fdXZdYe~dZe~d[e~d\e~fd]Z G d^ d_          Z G d` daeW          Z G db dceUe                   Z G dd deeVe                   Z eKj        eeef           G dg dhejx        edeeecef                      Zdieeeej}        f                  d9eeeej}        f                  fdjZdS )k    N)IterableMappingSequence)	dataclass)cached_propertypartial)islice)	Annotated)	rearrange)BatchFeaturePretrainedConfigProcessorMixin
TensorType)
ImageInput)	TextInput)	Attention)support_torch_compile)CacheConfig
VllmConfig)BaseDummyOptions)get_pp_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_sizesplit_tensor_along_last_dim tensor_model_parallel_all_gather)
MulAndSilu	QuickGELU
SiluAndMul)MMEncoderAttention)RMSNorm)ColumnParallelLinearMergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)MultiModelKeys)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)ImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptIndexTargetsPromptInsertionPromptUpdatePromptUpdateDetails)IntermediateTensors)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPPSupportsQuant)AutoWeightsLoaderWeightsMapperis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixi   z
<im_patch>z<im_col>z
<im_start>z<im_end>   c                       e Zd ZU dZeej         eddd          f         ed<   eej        dz   edd          f         ed<   eej         edd          f         ed	<   	 eej         ed
          f         ed<   dS )MolmoImageInputsz
    Dimensions:
        - bn: Batch size * number of images
        - bnc: Batch size * number of images * number of crops (dynamic)
        - np: Number of patches
        - tp: Token sequence positions
        - pd: Patch dimension
    bncnppdimagesNimage_maskstpimage_input_idxbn	num_crops)	__name__
__module____qualname____doc__r
   torchTensorr<   __annotations__     t/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/molmo.pyrM   rM   _   s           elKKtT$B$BBCCCC5<$.E40H0HHIIIIu|[[-E-EEFFFFW{{4'8'88999999r_   rM   c                       e Zd ZU dZeeef         ed<   dZeed<   dZeed<   dZ	eed<   dZ
eed	<   dZeed
<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   d Zed             ZdS )VisionBackboneConfig)P  rc   image_default_input_size   image_patch_sizeimage_pos_patch_sizei   image_emb_dim   image_num_headsimage_num_key_value_heads   image_num_layersi   image_mlp_dim
quick_geluimage_mlp_activationsiA  image_num_posgh㈵>image_norm_epsc                 8    t          | j                  | _        d S N)tuplerd   selfs    r`   __post_init__z"VisionBackboneConfig.__post_init__   s    (-d.K(L(L%%%r_   c                 >    | j         \  }}|| j        z  || j        z  fS rt   )rd   rf   )rw   hws      r`   image_num_patchz$VisionBackboneConfig.image_num_patch   s)    ,1D))10E+EEEr_   N)rW   rX   rY   rd   ru   intr]   rf   rg   rh   rj   rk   rm   rn   rp   strrq   rr   floatrx   propertyr|   r^   r_   r`   rb   rb   s   s        0:eCHo:::c "#"""M3OS%'s'''cM3!-3---M3 NE   M M M F F XF F Fr_   rb   c                   `     e Zd ZdZ	 	 ddededz  def fdZdej	        d	ej	        fd
Z
 xZS )ViTMLPzMLP used in Vision Transformer.N configquant_configprefixc                     t                                                       t          |j        |j        d|| d          | _        |j        dk    sJ t                      | _        t          |j        |j        d|| d          | _
        d S )NTz.w1biasr   r   ro   z.w2)super__init__r!   rh   rn   w1rp   r   actr$   w2rw   r   r   r   	__class__s       r`   r   zViTMLP.__init__   s     	&  %>>>
 
 
 +|;;;;;;#  %>>>
 
 
r_   xreturnc                     |                      |          \  }}|                     |          }|                     |          \  }}|S rt   )r   r   r   )rw   r   _s      r`   forwardzViTMLP.forward   s<    wwqzz1HHQKKwwqzz1r_   Nr   rW   rX   rY   rZ   rb   r&   r~   r   r[   r\   r   __classcell__r   s   @r`   r   r      s        ))
 37	
 
$
 )4/
 	
 
 
 
 
 
2 %,        r_   r   c                        e Zd ZdZ	 	 	 	 ddededed	edz  d
ef
 fdZ		 dde
j        de
j        dz  de
j        fdZ xZS )MultiHeadDotProductAttentionz0Multi-head attention used in Vision Transformer.Tr=   Nr   r   use_biasnlayersr   r   c                    t                                                       |j        | _        |j        | _        t                      }| j        | j        z  dk    sJ | j        |z  dk    sJ | j        |z  | _        | j        | j        z  | _        |j	        | _
        | j
        |k    r| j
        |z  dk    sJ n|| j
        z  dk    sJ t          d| j
        |z            | _        t          || j        z  | j        | j        z  ||| d          | _        t          || j        z  | j
        | j        z  ||| d          | _        t          || j        z  | j
        | j        z  ||| d          | _        t#          | j        | j        z  | j        ||| d          | _        | j        dz  | _        t)          | j        | j        | j        | j        	          | _        d S )
Nr   r=   z.wqr   z.wkz.wvz.wo      )num_kv_heads)r   r   rh   hidden_sizerj   total_num_headsr   	num_headshead_dimrk   total_num_kv_headsmaxr   r!   wqwkwvr$   woscaler   attn)rw   r   r   r   r   r   tp_sizer   s          r`   r   z%MultiHeadDotProductAttention.__init__   s+    	!/%5688$"66!;;;;#g-2222-8(D,@@"("B"g--*W499999T4499994#:g#EFF&d&& 4=0%>>>
 
 
 'd&&#dm3%>>>
 
 
 'd&&#dm3%>>>
 
 
 $ 4=0%>>>
 
 
 ]D(
&NDM4:DDU
 
 
			r_   inputs_q	inputs_kvr   c                 
   ||}|}n|}|}|                      |          \  }}|                     |          \  }}|                     |          \  }}|                     |||          }	|                     |	          \  }	}|	S rt   )r   r   r   r   r   )
rw   r   r   inputs_kinputs_vxqr   xkxvoutputs
             r`   r   z$MultiHeadDotProductAttention.forward   s       H HHHH!!A!!A!!A2r2&&GGFOO	r_   )Tr=   Nr   rt   )rW   rX   rY   rZ   rb   boolr}   r&   r~   r   r[   r\   r   r   r   s   @r`   r   r      s        ::
 26<
 <
$<
 <
 	<

 )4/<
 <
 <
 <
 <
 <
 <
~ HL 161D	       r_   r   c                   `     e Zd ZdZ	 	 ddededz  def fdZdej	        d	ej	        fd
Z
 xZS )ResidualAttentionBlockz4Residual attention block used in Vision Transformer.Nr   r   r   r   c                 D   t                                                       t          ||| d          | _        t	          ||| d          | _        t          j        |j        |j	                  | _
        t          j        |j        |j	                  | _        d S )Nz
.attentionr   r   z.feed_forwardr   eps)r   r   r   	attentionr   feed_forwardnn	LayerNormrh   rr   attention_normffn_normr   s       r`   r   zResidualAttentionBlock.__init__  s     	56K6K6K
 
 
 #LF)A)A)A
 
 
 !l %
 
 
  %
 
 
r_   r   r   c                     ||                      |                     |                    z   }||                     |                     |                    z   }|S rt   )r   r   r   r   )rw   r   s     r`   r   zResidualAttentionBlock.forward  sO    t22155666!!$--"2"2333r_   r   r   r   s   @r`   r   r     s        >>
 37	
 
$
 )4/
 	
 
 
 
 
 
, %,        r_   r   c                   l     e Zd ZdZ	 	 ddededz  def fdZdej	        d	e
ej	                 fd
Z xZS )BlockCollectionzCCollection of residual attention blocks used in Vision Transformer.Nr   r   r   r   c                     t                                                       t          j        fdt	          j                  D                       | _        d S )Nc           	      >    g | ]}t           d |           S )z.resblocks.r   )r   ).0ir   r   r   s     r`   
<listcomp>z,BlockCollection.__init__.<locals>.<listcomp>,  sO         'LF1J1Jq1J1J    r_   )r   r   r   
ModuleListrangerm   	resblocksr   s    ```r`   r   zBlockCollection.__init__$  sr     	      v677	  
 
r_   r   r   c                 ^    g }| j         D ]"} ||          }|                    |           #|S rt   )r   append)rw   r   hidden_statesrs       r`   r   zBlockCollection.forward4  sB     	$ 	$A!A  ####r_   r   )rW   rX   rY   rZ   rb   r&   r~   r   r[   r\   listr   r   r   s   @r`   r   r   !  s        MM
 37	
 
$
 )4/
 	
 
 
 
 
 
  $u|*<        r_   r   token
batch_sizer   c                 Z    |                      ddd                              |dd          S )Nr=   )viewexpand)r   r   s     r`   _expand_tokenr   <  s*    ::aB&&z2r:::r_   c                        e Zd ZdZ	 	 ddededz  def fdZdej	        d	e
d
ej	        fdZ	 ddej	        d	e
dz  d
eej	                 fdZ xZS )VisionTransformerz+Vision Transformer used in Vision Backbone.Nr   r   r   r   c                 >   t                                                       |j        dz  }|j        | _        t          j        t          j        |j                  |z            | _	        t          | _        t          j        t          j        |j        |j                  |z            | _        |j        }t          j        ||z  dz  |j        d          | _        t          j        |j        |j                  | _        t)          ||| d          | _        d S )Nr      F)r   r   z.transformerr   )r   r   rh   r|   	patch_numr   	Parameterr[   randnclass_embeddingNUM_PREFIX_TOKENSnum_prefix_tokensrq   positional_embeddingrf   Linearpatch_embeddingr   rr   pre_lnr   transformer)rw   r   r   r   r   rf   r   s         r`   r   zVisionTransformer.__init__C  s    	$d*/!|EK8L,M,MPU,UVV&7$&LK,f.BCCeK%
 %
! "2!y//!3  
  
  

 l6#7V=RSSS*LF)@)@)@
 
 
r_   r   r   r   c           	      <   | j         dd         }| j         dd          }|                    t          t          j        |j        d                             t          t          j        |j        d                             |j        d         f          }|\  }}|j        d         |k    s|j        d         |k    rq|                    d                              dddd          }t          j	        |||fddd          }|                    dddd          
                    d          }|                    d	|j        d	                   }|t          j        |d d d d d f         |d d d d d f         gd
                              |j                  z   }|S )Nr   r=   r   rK   bicubicFT)sizemodealign_corners	antialiasr   dim)r   reshaper}   mathsqrtshape	unsqueezepermuteFinterpolatesqueezer[   cattodtype)rw   r   r   cls_embpos_embpatch_num_0patch_num_1s          r`   add_pos_embzVisionTransformer.add_pos_emb\  s   +AaC0+ABB///DIgmA.//00DIgmA.//00a 
 
 &/"k={**gmA.>+.M.M''**221aA>>Gm!;/#  G ooaAq1199!<<G//"gmB&788	74AAA:.aaa
0CD!LLLOOPQPWXXXr_   c                 t   || j         }|j        \  }}}|                     |          }t          j        t          | j        |j        d                                       |j                  |gd          }| 	                    ||          }| 
                    |          }|                     |          }|S )z>
        : param x: (batch_size, num_patch, n_pixels)
        Nr   r=   r   )r   r   r   r[   r   r   r   r   r   r  r   r   )rw   r   r   BNDr   s          r`   r   zVisionTransformer.forwardz  s     I'1a  ## I4/<<??HH!LRS
 
 
 Q	**KKNN((++r_   r   rt   )rW   rX   rY   rZ   rb   r&   r~   r   r[   r\   r}   r  r   r   r   r   s   @r`   r   r   @  s        55
 37	
 
$
 )4/
 	
 
 
 
 
 
2U\ c el    > 8< *-*	el	       r_   r   c                        e Zd ZdZ	 	 	 ddededz  dedz  deddf
 fd	Zd
e	j
        de	j
        dee	j
        e	j
        f         fdZde	j
        de	j
        de	j
        fdZ xZS )MolmoAttentionzMolmo's LLM attention.Nr   r   cache_configr   r   r   c           
         t                                                       |j        | _        t                      | _        |j        | _        | j        | j        z  dk    sJ | j        | j        z  dk    sJ | j        | j        z  | _        |j        p| j        | _	        | j	        | j        k    r| j	        | j        z  dk    sJ n| j        | j	        z  dk    sJ t          d| j	        | j        z            | _        | j        | j        z  | _        | j        | j        z  | _        | j        | j        z  | _        |j        | _        t!          | j        | j        | j        | j	        |j        || d          | _        d | _        d | _        d | _        |j        r[t/                      | _        t1          | j	        | j        z  |j                  | _        t1          |j        |j                  | _        t5          | j        | j        |j                  | _        | j        dz  | _        t=          | j        | j        | j        | j        ||| d	          | _        tA          | j        | j        z  | j        d
|| d          | _!        d S )Nr   r=   z	.qkv_projr   r   )max_positionrope_parametersr   z.attn)r   r
  r   r   Fz.o_proj)"r   r   r   r   r   num_attention_headsr   r   num_key_value_headsr   r   r   r   q_sizekv_sizemax_position_embeddingsr#   qkv_biasqkv_projtp_rankk_normq_normattention_layer_normr   r    layer_norm_epsr'   r  
rotary_embscalingr   r   r$   o_projrw   r   r
  r   r   r   s        r`   r   zMolmoAttention.__init__  s    	!-;==%9$"66!;;;;#dl2a7777-="("<"T@T"dl22*T\9Q>>>>><$"99Q>>>>4#:dl#JKK(D,@@nt}4(4=8'-'E$ *M #%'''
 
 
 $((,(,& 	Q9;;DL!'$-7V=R  DK "&"4&:OPPPDK #M5"2
 
 

 }d*NML*%%###
 
 
	 ( 4=0%%%%
 
 
r_   qkc                    | j         dk    rBt          |                                          }t          |                                          }|                     |          }|                     |          }| j         dk    rGt          t          | j                   } ||          | j                 } ||          | j                 }||fS )Nr=   )num_partitions)r   r   
contiguousr  r  r   r   r  )rw   r  r  splitters       r`   _apply_qk_normzMolmoAttention._apply_qk_norm  s     <!0@@A0@@AKKNNKKNN<!:4<XXXHDL)ADL)A!tr_   	positionsr   c                 p   |                      |          \  }}|                    | j        | j        | j        gd          \  }}}| j         | j        |                     ||          \  }}|                     |||          \  }}|                     |||          }| 	                    |          \  }	}|	S )Nr   r   )
r  splitr  r  r  r  r$  r  r   r  )
rw   r%  r   qkvr   r  r  vattn_outputr   s
             r`   r   zMolmoAttention.forward  s    
 }--Q))T[$,E2)NN1a;"t{'>&&q!,,DAqy!Q//1ii1a((KK,,	r_   NNr   )rW   rX   rY   rZ   r   r   r&   r~   r   r[   r\   ru   r$  r   r   r   s   @r`   r	  r	    s         
 ,026I
 I
 I
 "D(I
 )4/	I

 I
 
I
 I
 I
 I
 I
 I
V"',	u|U\)	*   < | 
	       r_   r	  c                   p     e Zd ZdZ	 	 	 ddededz  dedz  deddf
 fd	Zd
e	j
        de	j
        fdZ xZS )LanguageModelMLPzMolmo's LLM mlp.Nr   r   	input_dimr   r   r   c                 H   t                                                       |j        | _        |j        dz  | _        t	          |p| j        | j        gdz  d|| d          | _        t                      | _        t          | j        | j        d|| d          | _	        d S )NrK   Fz.gate_up_projr   
.down_proj)
r   r   r   intermediate_sizer"   gate_up_projr   act_fnr$   	down_projrw   r   r.  r   r   r   s        r`   r   zLanguageModelMLP.__init__   s     	!-!'!9Q!>6))#$q(%+++
 
 
 !ll*"%(((
 
 
r_   r   c                     |                      |          \  }}|                     |          }|                     |          \  }}|S rt   )r2  r3  r4  rw   r   gate_upr   s       r`   r   zLanguageModelMLP.forward  sF     &&q))
KK  ~~a  1r_   r+  rW   rX   rY   rZ   r   r}   r&   r~   r   r[   r\   r   r   r   s   @r`   r-  r-    s        
 !%26
 
 
 :
 )4/	

 
 

 
 
 
 
 
:< 
       r_   r-  c                   p     e Zd ZdZ	 	 	 ddededz  dedz  deddf
 fd	Zd
e	j
        de	j
        fdZ xZS )ImageProjectorMLPzMolmo's image_projector mlp.Nr   r   r.  r   r   r   c                 H   t                                                       |j        | _        |j        dz  | _        t	          |p| j        | j        gdz  d|| d          | _        t                      | _        t          | j        | j        d|| d          | _	        d S )NrK   Fz.merged_linearr   r0  )
r   r   r   r1  r"   merged_linearr   r3  r$   r4  r5  s        r`   r   zImageProjectorMLP.__init__*  s     	!-!'!9Q!>7))#$q(%,,,
 
 
 !ll +"%(((
 
 
r_   r   c                     |                      |          \  }}|                     |          }|                     |          \  }}|S rt   )r=  r3  r4  r7  s       r`   r   zImageProjectorMLP.forwardH  sF     ''**
KK  ~~a  1r_   r+  r9  r   s   @r`   r;  r;  '  s        &&
 !%26
 
 
 :
 )4/	

 
 

 
 
 
 
 
<< 
       r_   r;  c                        e Zd Z	 	 	 ddededz  dedz  deddf
 fdZd	ej	        d
ej	        dej	        dz  de
ej	        e
ej	        ej	        f         dz  f         fdZ xZS )MolmoDecoderLayerNr   r   r
  r   r   r   c                 L   t                                                       t          |||| d          | _        t	          ||| d          | _        |j        dk    sJ t          |j        |j	                  | _
        t          |j        |j	                  | _        d S )Nz
.self_attnr   z.mlpr   rmsr   )r   r   r	  	self_attnr-  mlplayer_norm_typer    r   r  input_layernormpost_attention_layernormr  s        r`   r   zMolmoDecoderLayer.__init__S  s     	'L,&7L7L7L
 
 

 $ooo
 
 

 %....&v'9v?TUUU(/F$9)
 )
 )
%%%r_   r%  r   residualc                     ||}|                      |          }n|                      ||          \  }}|                     ||          }|                     ||          \  }}|                     |          }||fS N)r%  r   )rF  rC  rG  rD  rw   r%  r   rH  s       r`   r   zMolmoDecoderLayer.forwardl  s     $H 00??MM&*&:&:=(&S&S#M8' ' 
 

 #'"?"?x"X"Xx//h&&r_   r+  )rW   rX   rY   r   r   r&   r~   r   r[   r\   ru   r   r   r   s   @r`   r@  r@  R  s         ,026
 
 
 "D(
 )4/	

 
 

 
 
 
 
 
2'<' |' ,%	'
 
u|U5<#=>EE	F' ' ' ' ' ' ' 'r_   r@  c                       e Zd Zdej        dej        dej        dz  deej        eej        ej        f         dz  f         fdZdS )MolmoDecoderNormAfterLayerr%  r   rH  Nr   c                     |}|                      ||          }|                     |          }||z   }|}|                     |          }|                     |          }||z   }d }||fS rJ  )rC  rF  rD  rG  rK  s       r`   r   z"MolmoDecoderNormAfterLayer.forward  s     !' ' 
 

 ,,];;%0 //55mDD%0h&&r_   )rW   rX   rY   r[   r\   ru   r   r^   r_   r`   rM  rM    sx        '<' |' ,%	'
 
u|U5<#=>EE	F' ' ' ' ' 'r_   rM  c                   0    e Zd ZdddgiZ	 	 ddedededz  d	ed
df
 fdZe	d
e
j        fd            Ze	d
e
j        fd            Zde
j        d
e
j        fdZde
j        de
j        d
e
j        fdZdeeee
j        f                  d
ee         fdZ xZS )MolmoVisionBackboner=  	gate_projup_projNr   r   vision_configr   r   r   c                    t                                                       t          | _        |j        | _        | j        d         dz   t
          z  | j        d         dz   t
          z  f| _        t          ||| d          | _        | j        j	        | _	        | j	        dv s
J d            t          |t          | j                  || d          | _        t          ||j        || d	
          | _        |j        t          | j                  z  }t!          j        t%          j        d|f                    | _        d S )Nr   r=   z
.image_vitr   >   r   r=   z'Only 0 or 1 prefix tokens are supportedz.image_pooling_2d)r   r   r   z.image_projector)r.  r   r   rK   )r   r   
VIT_LAYERS
vit_layersr|   POOLING_SIZEllm_patches_per_cropr   	image_vitr   r   lenimage_pooling_2dr;  rh   image_projectorr   r   r[   zeros	pad_embed)rw   r   rS  r   r   	image_dimr   s         r`   r   zMolmoVisionBackbone.__init__  s\    	$,<!!$q(\9!!$q(\9%
! +=R=R=R
 
 
 "&!A%///5 0// !=((%///	!
 !
 !
  1#1%...	 
  
  
 "/#do2F2FF	ek1i.&A&ABBr_   c                 .    | j         j        j        j        S rt   )rY  r   weightr   rv   s    r`   r   zMolmoVisionBackbone.dtype  s    ~-4::r_   c                 .    | j         j        j        j        S rt   )rY  r   ra  devicerv   s    r`   rc  zMolmoVisionBackbone.device  s    ~-4;;r_   rQ   c                    |j         \  }}}}t          j        |                    ||z  ||          dk    dd           }|                    ||z  ||          }|                     |          }| j        >g }| j        D ]}	|                    ||	                    t          j        |d          }n|d         }| j        dk    r|ddddf         }||z  }|                    |||d          }|S )	zN
        : param images: (batch_size, num_crops, num_patch, n_pixels)
        r   )r=   rK   T)r   keepdimNr   r   r=   )	r   r[   allr   rY  rV  r   r   r   )
rw   rQ   r  Tr  r  maskimage_featuresfeatureslayers
             r`   encode_imagez MolmoVisionBackbone.encode_image  s    \
1a	&++a!eQ22b8fdSSSSQUAq))//?&H 7 7u 56666"YxR888NN+B/N!A%%+AAAqrrE2N'$.',,Q1b99r_   rR   c                    |j         d d         \  }}|                    | j        | j                  }|                     |          }|j        }|J | j        d d d d d d d f         }|dk    }t          j        |dk     t          j        |                                        t          j	                  }	|                    t          j	                  }||d         t          j
        |d          z  z   }||d         t          j
        |	d          z  z   }|                    |          }|                    ||f| j        z   dz             }| j        d         t          z  x}
rt          j        |ddd|
d|
ddddf
          }t!          |dt          t          	          }|                    d
d          }|                     ||          }| j        \  }}|                    ||||z  d          }|                     |          }|S )NrK   )rc  r   r   r=   r   r   )r   z*b n (h dh) (w dw) c -> (b n h w) (dh dw) c)dhdwrI   T)re  )r   r   rc  r   rl  r^  r[   logical_andlogical_notfloat32r   r   r|   rW  r   padr   meanr[  rX  r   r\  )rw   rQ   rR   r   	num_imageri  og_dtyper^  all_padpartial_pad	missing_wqueryrz   r{   s                 r`   r   zMolmoVisionBackbone.forward  s9    !'RaR 0
I$+TZ@@**622!'&&&N111dD$#9:	"'a9J79S9STTWW- X 
 
 **5=*11')A,RT9U9U*UU')A,:
 :
 +
 
 (**844'//#d&::UB
 
 ,Q/,>>9 	UAq)Q	1aA> N #8	
 
 
 ##B#55..unEE(1',,ZAE2NN--n== r_   weightsc                 (   ddg}t          |                                           }t                      }|D ]\  }}|D ]i\  }}}	||vr|                    ||          }|                    d          r||vr;t          ||           rL||         }
|
j        } ||
||	            nU|                    d          r||vrt          ||           r||         }
t          |
dt                    } ||
|           |	                    |           |S )N)r=  rQ  r   )r=  rR  r=   .biasweight_loader)
dictnamed_parameterssetreplaceendswithrE   r  getattrr*   add)rw   r|  stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr  s               r`   load_weightsz MolmoVisionBackbone.load_weights  sc    .+"

 4002233"%%%#* 	$ 	$D-5K 4 41
Kd**||K<<==)) d+.E.E*466 #D) % 3e]H=====)) d+.E.E*466 #D) '@U V Ve]333d####r_   r   )rW   rX   rY   packed_modules_mappingr   rb   r&   r~   r   r   r[   r   rc  r\   rl  r   r   ru   r  r  r   r   s   @r`   rP  rP    s       -Y/GH 37#C #C #C ,#C )4/	#C
 #C 
#C #C #C #C #C #CJ ;u{ ; ; ; X; < < < < X<5< EL    655 \5 
	5 5 5 5n HU33D-E$F  3s8                r_   rP  c                        e Zd Zdddedef fdZdej        dej        fdZ	 	 ddej        d
ej        de	d	z  dej        d	z  dej        f
dZ
deeeej        f                  dee         fdZ xZS )
MolmoModelr   r   vllm_configr   c                D   t                                                       |j        j        |j        |j        | _        j        pj        | _        | xj        t          z  c_        t          | j        j                  | _        j        rt          nt          t!          j        fd| d          \  | _        | _        | _        j        dk    sJ t-          j        j                  | _        t3          ddgj                  | _        d S )N)r   c                 "     |           S )Nr   r^   )r   r
  r   decoder_layerr   s    r`   <lambda>z%MolmoModel.__init__.<locals>.<lambda>X  s!    ==l6   r_   z.layersr   rB  r   rH  )r   r   model_config	hf_configr
  r   r   embedding_size
vocab_sizeADDITIONAL_VOCAB_SIZEr)   r   embed_tokens
norm_afterrM  r@  rG   num_hidden_layersstart_layer	end_layerlayersrE  r    r  normrF   make_empty_intermediate_tensors)rw   r  r   r
  r   r  r   r   s      @@@@r`   r   zMolmoModel.__init__B  sT   )3"/"/$3Hv7H442%
 
 
 +1*;R&&AR 	 9D$       %%%9
 9
 9
5$.$+ %....F.0EFF	/Vj)6+=0
 0
,,,r_   	input_idsr   c                 ,    |                      |          S rt   )r  )rw   r  s     r`   embed_input_idszMolmoModel.embed_input_idse  s      +++r_   Nr%  intermediate_tensorsinputs_embedsc                    t                      j        r||}n|                     |          }d }n|J |d         }|d         }t          | j        | j        | j                  D ]} ||||          \  }}t                      j        st          ||d          S || 	                    ||          \  }}n| 	                    |          }|S )Nr   rH  )r   rH  )
r   is_first_rankr  r	   r  r  r  is_last_rankr:   r  )	rw   r  r%  r  r  r   rH  rk  r   s	            r`   r   zMolmoModel.forwardh  s    >>' 		8( - $ 1 1) < <HH'3330AM+J7H DK)94>JJ 	 	E&+e' '#M88
 ~~* 	&"/XFF   #yyAAM11 IIm44Mr_   r|  c                 H   t          |                                           }t                      }|D ]o\  }}|                    d          r||vrt	          ||           r0||         }t          |dt                    } |||           |                    |           p|S )Nr~  r  )r  r  r  r  rE   r  r*   r  )rw   r|  r  r  r  r  r  r  s           r`   r  zMolmoModel.load_weights  s    4002233"%%%#* 		$ 		$D-}}W%% $k*A*A&tT22 %E#E?<QRRMM%///d####r_   NN)rW   rX   rY   r   r~   r   r[   r\   r  r:   r   r   ru   r  r  r   r   s   @r`   r  r  @  s       AC !
 !
 !
z !
3 !
 !
 !
 !
 !
 !
F, ,%, , , , , <@-1! !<! <! 2D8	!
 |d*! 
! ! ! !FHU33D-E$F 3s8        r_   r  r   r  c                     | |z  |z  S rt   r^   )r   r  s     r`   _lowest_multipler    s    Fa<r_   	num_tilescrop_patchesleft_marginright_marginpooling_sizec                    | dk    rt          ||z   dz
  |          S |||z   z
  }t          ||z   |z   dz
  |          }t          ||z   dz
  |          }t          ||z   |z   dz
  |          }|| dz
  |z  z   |z   S )Nr=   rK   )r  )	r  r  r  r  r  crop_window_patchesleft_num
middle_num	right_nums	            r`   get_num_patchesr    s     A~~| ;a ?NNN&+*DEk)L81< H "l*Q. J !l*\9A= I
 y1}
22Y>>r_   tiling_htiling_wc                 Z    t          | ||||          }t          |||||          }||fS )N)r  r  r  r  )r  )r  r  r  r  r  r  nrowsncolss           r`   get_patches_grid_sizer    sX     !!!  E !!!  E %<r_   max_numc                 f      fdt          d dz             D             }t          |d           S )Nc                 V    g | ]%}t          d d z             D ]}||z  k    ||f&S )r=   )r   )r   r   jr  s      r`   r   z)get_candidate_tilings.<locals>.<listcomp>  s[       q'A+&&  q5G 
A r_   r=   c                 $    | d         | d         z  S )Nr   r=   r^   )r   s    r`   r  z'get_candidate_tilings.<locals>.<lambda>  s    1! r_   )key)r   sorted)r  tilingss   ` r`   get_candidate_tilingsr    sR       q'A+&&  G '445555r_   heightwidth
patch_sizemax_num_patchesc                    t          |          }t          j        |t          j                  }||z  }t          j        | |gt          j                  }|                    t          j                  |z  }|                    dd          }	|	dk                                     r|	                                }
n,t          j	        |	dk     d|	          
                                }
||
         S )Nrn  r   T)axiskeepdimsr=   g      ?g    _B)r  rO   arrayint32rs  astypeminrf  argmaxwhereargmin)r  r  r  r  r  candidate_tilingscandidate_resolutionsoriginal_sizerequired_scale_drequired_scaleixs              r`   select_tilingr    s     $O44G999-
:Hfe_BJ???M,33BJ??-O%))rD)AAN!! K""$$Xns*D.AAHHJJR  r_   c            	           e Zd ZdZdef fdZedeee	f         fd            Z
ede	fd            Zedee	e	f         fd            Zede	fd            Zedee	e	f         fd	            Zede	fd
            Zede	fd            Zededz  fd            Zedefd            Zede	fd            Zede	fd            Zede	fd            Zede	fd            Zede	fd            Zde	de	dee	e	f         fdZde	de	dee	e	f         fdZ	 	 	 ddeee         z  dz  deee         z  dz  dee z  dz  de!fdZ" xZ#S )MolmoProcessorWrapperz
    Wraps `MolmoProcessor` so that it can be called directly.

    The original definition can be found here:
    https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/preprocessing_molmo.py
    	processorc                 V    t                                                       || _        d S rt   )r   r   r  )rw   r  r   s     r`   r   zMolmoProcessorWrapper.__init__  s$    "r_   r   c                 $    | j         j        j        S rt   )r  	tokenizervocabrv   s    r`   r  zMolmoProcessorWrapper.vocab  s    ~'--r_   c                 Z    | j         j        }|j        }t          |t                    sJ |S rt   )r  image_processor	max_crops
isinstancer}   )rw   r  r  s      r`   r  zMolmoProcessorWrapper.max_crops
  s0    .8#-	)S)))))r_   c                 x    | j         j        }|j        }t          |t                    r||fS t          |          S rt   )r  r  base_image_input_sizer  r}   ru   )rw   r  r  s      r`   r  z+MolmoProcessorWrapper.base_image_input_size  sD    .8 / E+S11 	@(*???*+++r_   c                 Z    | j         j        }|j        }t          |t                    sJ |S rt   )r  r  rf   r  r}   )rw   r  rf   s      r`   rf   z&MolmoProcessorWrapper.image_patch_size  s2    .8*;*C00000r_   c                     | j         j        }|j        \  }}t          |t                    sJ t          |t                    sJ ||fS rt   )r  r  overlap_marginsr  r}   )rw   r  r  r  s       r`   r  z%MolmoProcessorWrapper.overlap_margins&  sP    .8$3$C!\+s+++++,,,,,,L((r_   c                 Z    | j         j        }|j        }t          |t                    sJ |S rt   )r  r  image_token_length_wr  r}   )rw   r  r  s      r`   r  z*MolmoProcessorWrapper.image_token_length_w0  2    .8.C.44444##r_   c                 Z    | j         j        }|j        }t          |t                    sJ |S rt   )r  r  image_token_length_hr  r}   )rw   r  r  s      r`   r  z*MolmoProcessorWrapper.image_token_length_h9  r  r_   Nc                     dS )Nroler^   rv   s    r`   message_formatz$MolmoProcessorWrapper.message_formatB  s    vr_   c                     dS )NTr^   rv   s    r`   always_start_with_spacez-MolmoProcessorWrapper.always_start_with_spaceF  s    tr_   c                 &    | j         t                   S rt   )r  IMAGE_PATCH_TOKENrv   s    r`   image_patch_idz$MolmoProcessorWrapper.image_patch_idJ  s    z+,,r_   c                 &    | j         t                   S rt   )r  IM_COL_TOKENrv   s    r`   	im_col_idzMolmoProcessorWrapper.im_col_idN      z,''r_   c                 &    | j         t                   S rt   )r  IM_START_TOKENrv   s    r`   im_start_idz!MolmoProcessorWrapper.im_start_idR  s    z.))r_   c                 &    | j         t                   S rt   )r  IM_END_TOKENrv   s    r`   	im_end_idzMolmoProcessorWrapper.im_end_idV  r  r_   c                     t           S rt   )rW  rv   s    r`   r  z"MolmoProcessorWrapper.pooling_sizeZ  s    r_   image_widthimage_heightc                    | j         }| j        \  }}| j        }| j        }|||z   z  }|d         |z  }	|	||z   z
  }
|
|z  }t	          ||z
  ||z
  ||          \  }}||fS )Nr   )r  r  r  r  )r  r  r  rf   r  )rw   r	  r
  r  r  r  r  base_image_input_dtotal_margin_pixelsr  r  crop_window_sizer  r  s                 r`   r  z#MolmoProcessorWrapper.select_tiling^  s     N	$($8!\ $ :!20L;4NO,Q/3EE*l[.HI.1CC*"55 33'%	
 
 
( !!r_   c                    | j         \  }}| j        }| j        }| j        }|d         |z  }|                     ||          \  }	}
t          |
|	||||          \  }}||fS )Nr   )r
  r	  )r  r  r  r  r  r  )r  r  rf   r  r  r  )rw   r	  r
  r  r  r  r  r  r  r  r  r  r  s                r`   r  z+MolmoProcessorWrapper.get_patches_grid_sizev  s     %)$8!\ $ :!2(,Q/3EE!//%# 0 
 
(
 -%#%%
 
 
u e|r_   textrQ   return_tensorsc                       j         j        ||fi |}|g }t          |t                    s|g}|                    d          }|                    d          |d<   |                    dd           }|y|dk    } fd|D             }	t          j        |	                              d          dz   }
|
	                                t          |          k    sJ ||d<   |
|d<    j        |d<   t          |          S )	Nr  r   rT   c                 j    g | ]/}                     |j        d          |j        d                   0S )r   r=   r	  r
  )r  r   )r   imagerw   s     r`   r   z2MolmoProcessorWrapper.__call__.<locals>.<listcomp>  sR       
 	 "" %
1!&A #    r_   r   r=   rV   img_patch_id)r  processr  r   popr   r[   tensorprodsumrZ  r  r   )rw   r  rQ   r  kwargsoutputsr  rT   feat_is_patchr  rV   s   `          r`   __call__zMolmoProcessorWrapper.__call__  s>    )$.(&
 
"
 
 >F&$'' 	XF")++k":":	(22155!++&7>>&+q0M   
 $  G W--22266:I==??c-&8&88888)8G%&#,GK &*&9GN#G$$$r_   )NNN)$rW   rX   rY   rZ   r   r   r   r  r~   r}   r  r  ru   r  rf   r  r  r  r   r  r   r  r  r   r  r  r  r  r  r   r   r   r   r   r  r   r   s   @r`   r  r    sq        #. # # # # # #
 .tCH~ . . . _. 3    _ ,uS#X , , , _,  #       _  )sCx ) ) ) _) $c $ $ $ _$ $c $ $ $ _$ d
    X     X - - - - _- (3 ( ( ( _( *S * * * _* (3 ( ( ( _( c    X" " 	"
 
sCx" " " "0  	
 
sCx   < 487;26	&% &%$y/)D0&% T*--4&% j(4/	&% 
&% &% &% &% &% &% &% &%r_   r  c                   j    e Zd ZdedefdZdeeedz  f         fdZ	dedededz  defd	Z
defd
ZdS )MolmoProcessingInfor  r   c                 D     | j         j        di |}t          |          S Nr^   )ctxget_hf_processorr  )rw   r  r  s      r`   r%  z$MolmoProcessingInfo.get_hf_processor  s*    -DH-7777	$Y///r_   Nc                 
    dd iS )Nr  r^   rv   s    r`   get_supported_mm_limitsz+MolmoProcessingInfo.get_supported_mm_limits  s    r_   r	  r
  r  c                    ||                                  }|                    ||          \  }}|j        }|j        }|j        }d|dz   |z  z   }	d|dz   |z  dz   |dz   |z  z  z   }
|	|
z   S )Nr  rK   r=   )r%  r  r  r  r  )rw   r	  r
  r  r  r  r  r  r  extrajoints              r`   get_num_image_tokensz(MolmoProcessingInfo.get_num_image_tokens  s     --//I 66#% 7 
 
u !-(=(= )A-1EEEeaiL014%!)9TUUu}r_   c                 (   |                                  }t          |j                  }|j        \  }}d\  }}|D ]@\  }}||z  ||z  }
}	|                     |	|
|          }||k    r|}t          |	|
          }A|dk    s|t          d          |S )N)r   N)r	  r
  r  )r  r  r   z(Cannot have a largest feature size of 0!)r%  r  r  r  r+  r1   
ValueError)rw   r  r  base_hbase_wlargest_feature_sizelargest_feature_pinpointwrhrr  r  	feat_sizes               r`   !get_image_size_with_most_featuresz5MolmoProcessingInfo.get_image_size_with_most_features  s    ))++	'	(;<<"89@66 
	Q 
	QFB"RK"6E11!## 2  I
 ///'0$+45+P+P+P(1$$(@(HGHHH''r_   )rW   rX   rY   objectr  r%  r   r~   r}   r'  r+  r1   r5  r^   r_   r`   r!  r!    s        0 04I 0 0 0 0cDj)A      	
 )4/ 
   2(9 ( ( ( ( ( (r_   r!  c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	MolmoDummyInputsBuilder	mm_countsr   c                     dS r   r^   )rw   r9  s     r`   get_dummy_textz&MolmoDummyInputsBuilder.get_dummy_text  s    rr_   Nseq_len
mm_optionsc                     | j                                         \  }}|                    dd          }|r|                    d          nd }d|                     ||||          iS )Nr  r   )r  r  
num_images	overrides)infor5  get_get_dummy_images)rw   r<  r9  r=  target_widthtarget_heightr?  image_overridess           r`   get_dummy_mm_dataz)MolmoDummyInputsBuilder.get_dummy_mm_data  s|     '+i&Q&Q&S&S#m]]7A..
5?I*..111T T++"$%)	 ,  
 	
r_   rt   )
rW   rX   rY   r   r~   r}   r;  r   r-   rG  r^   r_   r`   r8  r8    s        S(9 c     =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
r_   r8  c            	           e Zd Zdee         dee         fdZdedeee	f         deee
f         fdZdedeee	f         dedee         fd	Zd
S )MolmoMultiModalProcessorprompt_tokensr   c                 ~   | j                                         }|j                            | j                                                             |          d|j                  }| j         j                            |t          |                    }|
                    d                                          \  }|S )Nnone)r  r  )tokensr  )rA  r%  r  get_tokens_inputget_tokenizerdecoder  r$  call_hf_processorr  r  tolist)rw   rJ  r  rM  processed_data
prompt_idss         r`   _apply_hf_processor_tokens_onlyz8MolmoMultiModalProcessor._apply_hf_processor_tokens_only  s     I..00	
 $55I##%%,,];;!$-$E 6 
 
 88
 
 '**;77>>@@r_   	hf_inputshf_processor_mm_kwargsc           
      T   |                     dt          j        d                    }t          |          }t	          t          j        d|          t          j        d|          t          j        d|          t          j        d          t          j        d|                    S )NrV   r   r  )rQ   rR   rT   rV   r  )	rB  r[   emptyrZ  r  r.   flat_from_sizesbatchedshared)rw   rV  rW  rV   r?  s        r`   _get_mm_fields_configz.MolmoMultiModalProcessor._get_mm_fields_config'  s    
 MM+u{1~~>>	^^
(8)LL-=gyQQ1A'9UU+3G<<.5gzJJ
 
 
 	
r_   mm_itemsout_mm_kwargsc           	      8  	
  | j         j        di |j        }j        }j        j        j        	j        j        
g|z  	gz   }g||z  z   
gz   dt          f	
fd}t          dt          j        d          |          gS )Nitem_idxc                 $                        dt                    }|                    |           }                    |j        |j                  \  }}
g|dz   z  z  gz   }g||dz   z  z  z   	gz   }t          j        |z   
          S )Nr  r  r=   )embed_token_id)	get_itemsr0   get_image_sizer  r  r  r9   select_token_id)ra  rQ   
image_sizer  r  	joint_rowr*  extra_joint
img_col_id
img_end_idr  img_start_idr^  r  r  s          r`   get_insertion_molmozIMolmoMultiModalProcessor._get_prompt_updates.<locals>.get_insertion_molmoK  s    ''1DEEF..x88J$::&,'. ;  LE5
 &519*EF*UI	l:;<,  '6e#+   r_   r  z<|endoftext|>)modalitytarget	insertionr^   )rA  r%  r  r  r  r  r   r  r  r}   r7   r6   r   )rw   r^  rW  r_  r  r  	extra_rowrm  ri  rj  rk  r  rl  r  r  s    `      @@@@@@@r`   _get_prompt_updatesz,MolmoMultiModalProcessor._get_prompt_updates7  s    /DI.HH1GHH	(=(= - /(
 ,(
!N%99ZLH	#ny3G'GG:,V	# 	 	 	 	 	 	 	 	 	 	 	 	 	,  )0AA-  
 	
r_   N)rW   rX   rY   r   r}   rU  r   r   r~   r6  r.   r]  r2   r/   r   r8   rr  r^   r_   r`   rI  rI    s        Cy 
c   0

 !(V 4
 
++	,	
 
 
 
 /
%/
 !(V 4/
 -	/

 
,	/
 /
 /
 /
 /
 /
r_   rI  )rA  dummy_inputsc                       e Zd Z eddddddddd	d
ddddddd          ZdgdgddgdZededededz  fd            Z	ddde
d ef fd!Zd"ededz  fd#Zd$edeej                 fd%Zd"edefd&Z	 	 d1d'ej        d(ej        d)edz  d*ej        dz  d"edej        fd+Zd,ej        dej        fd-Zd.eeeej        f                  fd/Zdefd0Z xZS )2MolmoForCausalLMzimage_projector.gate_proj.zimage_projector.up_proj.zimage_projector.down_proj.zself_attn.qkv_projzself_attn.o_projzself_attn.q_normzself_attn.k_normzmlp.gate_up_projzmlp.down_projrF  rG  )zimage_projector.w1.zimage_projector.w3.zimage_projector.w2.att_projattn_outr  r  ff_projff_out	attn_normff_normzvision_backbone.zmodel.layers.zmodel.norm.zlm_head.)zmodel.vision_backbone.zmodel.transformer.blocks.zmodel.transformer.ln_f.z model.transformer.mlp.down_proj.)orig_to_new_substrorig_to_new_prefixr  r2  rQ  rR  )r  r2  r=  rn  r   r   Nc                 N    |                     d          rd S t          d          )Nr  z Only image modality is supported)
startswithr-  )clsrn  r   s      r`   get_placeholder_strz$MolmoForCausalLM.get_placeholder_str  s,    w'' 	4;<<<r_   r   r   r  r   c          
      ,   t                                                       |j        j        }|j        }|j        j        }|| _        || _        t                      }|                     |d          5  t          |||t          |d                    | _        d d d            n# 1 swxY w Y   |                     |          5  t          |t          |d                    | _        d d d            n# 1 swxY w Y   d | _        | j        j        r| j        j        j        | _        n7t)          |j        p|j        |j        |t          |d                    | _        t1          |j        p|j                  | _        | j        j        | _        d S )Nr  vision_backboner   model)r  r   lm_headr   )r   r   r  r  r   multimodal_configr   rb   _mark_tower_modelrP  rH   r  _mark_language_modelr  r  r  weight_tyingr   wter  r(   r  r  r   r%   logits_processorr  )rw   r  r   r   r   r  rS  r   s          r`   r   zMolmoForCausalLM.__init__  s   )3"/'4F!2,..##K99 	 	#6#F,=>>	$ $ $D 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 &&{33 	 	#'VW0M0M  DJ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	
 !;# 	:15DLL)%:):")#FI66	  DL !0!6V%6!
 !

 J6 	,,,s$   3'B&&B*-B*%C77C;>C;r  c                    |                     dd           }|                     dd           }|                     dd           }|                     dd           }|d S |                     dd           }t          |t          j                  r|                                }t          |t
                    sJ || _        t          ||||          S )NrQ   rR   rT   rV   r  )rQ   rR   rT   rV   )r  r  r[   r\   itemr}   r  rM   )rw   r  rQ   rR   rT   rV   r  s          r`   _parse_and_validate_image_inputz0MolmoForCausalLM._parse_and_validate_image_input  s     Hd++jj55 **%6==JJ{D11	>4zz.$77lEL11 	/',,..L,,,,,,(#+	
 
 
 	
r_   image_inputc                    |d         }|d         }|d         }|d         }|                      |                    d          |d n|                    d                                        d          }g }|                                }t	          |                    |          |                    |                    D ]H\  }	}
|
dk    }|
|         }t          j        |          }|                    |	|         |                    I|S )NrQ   rR   rT   rV   r   )rQ   rR   )	r  r   r   rR  zipr'  r[   argsortr   )rw   r  rQ   rR   rT   rV   ri  resultsnum_crops_listfeatsimg_idxis_validvalid_img_idxorders                 r`   _process_image_inputz%MolmoForCausalLM._process_image_input  s"    X&!-0%&78,	 --##A&& + 39N9Nq9Q9Q . 
 
 '!** 	 "))++!  00!!.11
 
 	3 	3NE7 !|H#H-MM-00ENN5?512222r_   c                 N     | j         di |}|g S |                     |          S r#  )r  r  )rw   r  r  s      r`   embed_multimodalz!MolmoForCausalLM.embed_multimodal  s9    :d:DDVDDI((555r_   r  r%  r  r  c                 @    |d }|                      ||||          }|S )N)r  )r  )rw   r  r%  r  r  r  r   s          r`   r   zMolmoForCausalLM.forward  s9      + M

y"6m # 
 
 r_   r   c                 <    |                      | j        |          }|S rt   )r  r  )rw   r   logitss      r`   compute_logitszMolmoForCausalLM.compute_logits  s    &&t|]CCr_   r|  c                 v    t          |           }t          |          }|                    || j                  S )N)mapper)rC   "_get_weights_with_merged_embeddingr  hf_to_vllm_mapper)rw   r|  loaders      r`   r  zMolmoForCausalLM.load_weights  s8    "4((4W==""743I"JJJr_   c                 0    t          j        ddd          S )z<
        Get the module prefix in multimodal models
        r  zvision_backbone.image_projectorr  )language_model	connectortower_model)r+   from_string_fieldrv   s    r`   get_mm_mappingzMolmoForCausalLM.get_mm_mapping  s'     /"7)
 
 
 	
r_   r  )rW   rX   rY   rD   r  r  classmethodr~   r}   r  r   r   r6  rM   r  r   r[   r\   r  r>   r  
LongTensorr:   r   r  r   ru   r  r+   r  r   r   s   @r`   ru  ru  i  sv        & $@#=#?,*(()%*1
 
" '9)8'4 1;	
 	
!  :  L'(%y1  =3 =3 =3: = = = [= BD *
 *
 *
z *
3 *
 *
 *
 *
 *
 *
X

 
D	 
 
 
 
4% 
el	   :6 64H 6 6 6 6 <@-1 # # 2D8	
 |d*  
   "EL U\    KHU33D-E$F K K K K

 
 
 
 
 
 
 
 
r_   ru  r|  c              #      K   i }| D ]\  }}d|v r||d<   d|v r||d<   ||fV   t          j        |d         |d         gd          }d|fV  d S )Nzwte.embedding	embeddingzwte.new_embeddingnew_embeddingr   r   zmodel.embed_tokens.weight)r[   r   )r|  embedding_weightsr  ra  s       r`   r  r  )  s        ! !fd""-3k** D((17o...     		;	'):?)KL   '(9
::::::r_   )r   collections.abcr   r   r   dataclassesr   	functoolsr   r   	itertoolsr	   typingr
   numpyrO   r[   torch.nnr   torch.nn.functional
functionalr   einopsr   transformersr   r   r   r   transformers.image_utilsr   $transformers.tokenization_utils_baser   vllm.attention.layerr   vllm.compilation.decoratorsr   vllm.configr   r   vllm.config.multimodalr   vllm.distributedr   r   r   r   r   %vllm.model_executor.layers.activationr   r   r   9vllm.model_executor.layers.attention.mm_encoder_attentionr   $vllm.model_executor.layers.layernormr    !vllm.model_executor.layers.linearr!   r"   r#   r$   +vllm.model_executor.layers.logits_processorr%   'vllm.model_executor.layers.quantizationr&   +vllm.model_executor.layers.rotary_embeddingr'   3vllm.model_executor.layers.vocab_parallel_embeddingr(   r)   -vllm.model_executor.model_loader.weight_utilsr*   )vllm.model_executor.models.module_mappingr+   vllm.multimodalr,   vllm.multimodal.inputsr-   r.   r/   vllm.multimodal.parser0   r1   r2   vllm.multimodal.processingr3   r4   r5   r6   r7   r8   r9   vllm.sequencer:   vllm.utils.tensor_schemar;   r<   
interfacesr>   r?   r@   rA   rB   utilsrC   rD   rE   rF   rG   rH   rU  r   r  r  r  r  r  rW  rM   rb   Moduler   r   r   r   r\   r}   r   r   r	  r-  r;  r@  rM  rP  r  r  r  ru   r  r   r  r  r  r!  r8  rI  register_processorru  r~   r  r^   r_   r`   <module>r     s
    7 7 7 7 7 7 7 7 7 7 ! ! ! ! ! ! . . . . . . . .                                       S S S S S S S S S S S S / / / / / / : : : : : : * * * * * * = = = = = = / / / / / / / / 3 3 3 3 3 3              T S S S S S S S S S X X X X X X 8 8 8 8 8 8            H G G G G G F F F F F F @ @ @ @ @ @        P O O O O O D D D D D D / / / / / /         
 V U U U U U U U U U                  . - - - - - > > > > > > > >                             "X
    : : : : :| : : :( F F F F F F F F,         RY      FR R R R R29 R R Rj    RY   >    bi   6; ;3 ;5< ; ; ; ;O O O O O	 O O Odh h h h hRY h h hV' ' ' ' 'ry ' ' 'T( ( ( ( (	 ( ( (V-' -' -' -' -'	 -' -' -'`' ' ' ' '!2 ' ' '2b b b b b")] b b bJ X X X X XM X X Xv      ?? ? 	?
 ? ? 	? ? ? ?:  	
    38_   663 64c3h+@ 6 6 6 6!! ! 	!
 ! ! ! !.% % % % % % % %D7( 7( 7( 7( 7(, 7( 7( 7(t
 
 
 
 
45HI 
 
 
2X
 X
 X
 X
 X
67JK X
 X
 X
v ('	(  
x
 x
 x
 x
 x
I!:|]x
 x
 
x
v;eC-./;eC%&'; ; ; ; ; ;r_   