
    .`i|                        d dl Z d dlmZmZmZ d dlmZmZ d dlZd dl	Z
d dlZd dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZmZmZmZ d d	lmZmZ d d
l m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0m1Z1 d dl2m3Z3 d dl4m5Z5m6Z6 d dl7m8Z8 d dl9m:Z:m;Z; d dl<m=Z= d dl>m?Z?m@Z@mAZA d dlBmCZC d dlDmEZEmFZFmGZGmHZHmIZI d dlJmKZK d dlLmMZM d dlNmOZOmPZP d dlQmRZR  e%eS          ZTdZU G d d e1          ZV G d! d"e0          ZW G d# d$ejX                  ZY G d% d&eY          ZZ G d' d(ejX                  Z[ G d) d*eO          Z\ G d+ d,          Z] G d- d.          Z^ G d/ d0eF          Z_ G d1 d2eEe_                   Z` G d3 d4eGe_                   Za G d5 d6ejX                  Zb e=jc        eae_e`7           G d8 d9ejX        e6                      ZddS ):    N)IterableMappingSequence)	AnnotatedLiteral)	rearrange)Image)OPENAI_CLIP_MEANOPENAI_CLIP_STD)
transforms)
BartConfigBatchFeaturePretrainedConfig
TensorType)CacheConfig
VllmConfig)
LoRAConfig)BaseDummyOptions)init_logger)
get_act_fn)ColumnParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)MultiModalEmbeddingsSupportsMultiModal)
RadioModel)WhisperAttentionWhisperCrossAttention)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)MultiModalDataItems)BaseDummyInputsBuilderBaseProcessingInfoEncDecMultiModalProcessorPromptReplacementPromptUpdate)RadioConfig)TokenizerLike)TensorSchemaTensorShape)AttentionType)i   ip  c                   \     e Zd ZdZ	 d
dededef fdZdej        dej        f fd	Z	 xZ
S )BartScaledWordEmbeddingzj
    This module overrides VocabParallelEmbedding's
    forward by multiplying with embeddings scale.
          ?num_embeddingsembedding_dimembed_scalec                 Z    t                                          ||           || _        d S Nsuper__init__r7   selfr5   r6   r7   	__class__s       }/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/nemotron_parse.pyr<   z BartScaledWordEmbedding.__init__M   -     	777&    	input_idsreturnc                 V    t                                          |          | j        z  S r9   r;   forwardr7   r>   rC   r?   s     r@   rG   zBartScaledWordEmbedding.forwardS   !    wwy))D,<<<rB   r4   __name__
__module____qualname____doc__intfloatr<   torchTensorrG   __classcell__r?   s   @r@   r3   r3   G   s          MP' '!'25'DI' ' ' ' ' '= =%, = = = = = = = = = =rB   r3   c                   \     e Zd ZdZ	 d
dededef fdZdej        dej        f fd	Z	 xZ
S )BartParallelLMHeadz
    This module overrides ParallelLMHead's
    forward by dividing by embeddings scale,
    yielding effectively the inverse of
    BartScaledWordEmbedding
    r4   r5   r6   r7   c                 Z    t                                          ||           || _        d S r9   r:   r=   s       r@   r<   zBartParallelLMHead.__init___   rA   rB   rC   rD   c                 V    t                                          |          | j        z  S r9   rF   rH   s     r@   rG   zBartParallelLMHead.forwarde   rI   rB   rJ   rK   rU   s   @r@   rW   rW   W   s          MP' '!'25'DI' ' ' ' ' '= =%, = = = = = = = = = =rB   rW   c            	            e Zd Z	 	 	 ddededz  dedz  def fdZ	 ddej	        d	ej	        dz  d
ej	        fdZ
 xZS )BartDecoderLayerN configcache_configquant_configprefixc           	         t                                                       |j        | _        t	          | j        |j        t          j        ||| d          | _        t          |j
                  | _        t          j        | j                  | _        	 t          | j        |j        ||| d          | _        t          j        | j                  | _        | j        }|j        }d}t'          ||||| d          | _        t+          ||||| d          | _        t          j        | j                  | _        d S )	Nz
.self_attn)	embed_dim	num_heads	attn_typer^   r_   r`   z.encoder_attnr^   r_   r`   Tz.fc1)biasr_   r`   z.fc2)r;   r<   d_modelrb   r!   decoder_attention_headsr1   DECODER	self_attnr   activation_functionactivation_fnnn	LayerNormself_attn_layer_normr"   encoder_attnencoder_attn_layer_normencoder_ffn_dimr   fc1r   fc2final_layer_norm)	r>   r]   r^   r_   r`   ffn_hidden_sizeffn_intermediate_sizeffn_has_biasr?   s	           r@   r<   zBartDecoderLayer.__init__j   s`    	)n4#+%%(((
 
 
 ((BCC$&L$@$@!	
 2N*%%+++
 
 
 (*|DN'C'C$. & 6'!%???
 
 
 %!%???
 
 
 !#T^ < <rB   decoder_hidden_statesencoder_hidden_statesrD   c                    |}|                      |          }||z   }|                     |          }|}|                     ||          }||z   }|                     |          }|}|                     |          \  }}|                     |          }|                     |          \  }}||z   }|                     |          }|S )z
        Args:
            decoder_hidden_states: torch.Tensor of *decoder* input embeddings.
            encoder_hidden_states: torch.Tensor of *encoder* input embeddings.
        Returns:
            Decoder layer output torch.Tensor
        hidden_statesr}   rz   )rj   ro   rp   rq   rs   rl   rt   ru   r>   ry   rz   residualr}   fc1_out_s          r@   rG   zBartDecoderLayer.forward   s     ) 5JKK =011-@@ !))'"7 * 
 

 !=044]CC !XXm,,
**73388M22q =0--m<<rB   )NNr\   r9   )rL   rM   rN   r   r   r   strr<   rR   rS   rG   rT   rU   s   @r@   r[   r[   i   s         ,0265= 5=5= "D(5= )4/	5=
 5= 5= 5= 5= 5= 5=t 6:* *$|*  %|d2* 
	* * * * * * * *rB   r[   c                   J    e Zd Z	 ddej        dej        dz  dej        fdZdS )MBartDecoderLayerNry   rz   rD   c                    |}|                      |          }|                     |          }||z   }|}|                     |          }|                     ||          }||z   }|}|                     |          }|                     |          \  }}|                     |          }|                     |          \  }}||z   }|S )Nr|   r~   )ro   rj   rq   rp   ru   rs   rl   rt   r   s          r@   rG   zMBartDecoderLayer.forward   s    
 )112GHH ]CC =0 !44]CC))'"7 * 
 

 !=0 !--m<<XXm,,
**73388M22q =0rB   r9   )rL   rM   rN   rR   rS   rG    rB   r@   r   r      sY         6:# #$|#  %|d2# 
	# # # # # #rB   r   c                        e Zd ZdZ	 	 	 	 	 ddededz  dedz  dedz  dej	        dz  d	e
f fd
Zdddej        dej        dz  dej        dz  dej        fdZdeee
ej        f                  dee
         fdZ xZS )MBartDecoderNoPosz
    Transformer decoder consisting of *config.decoder_layers* layers.
    Each layer is a [`BartDecoderLayer`]
    Args:
        config: BartConfig
        embed_tokens (nn.Embedding): output embedding
    Nr\   r]   r^   r_   lora_configembed_tokensr`   c                    t                                                       | _        | _        || _        j        rt          j        j                  nd}t          j
        j        |          | _        ||j        | j        _        t          j        fdt          j                  D                       | _        t          j        j                  | _        t          j        j                  | _        d S )Nr4   )r7   c           
      @    g | ]}t           d |           S )z.layers.r`   )r   ).0	layer_idxr^   r]   r`   r_   s     r@   
<listcomp>z.MBartDecoderNoPos.__init__.<locals>.<listcomp>  sT         "  $99i99	    rB   )r;   r<   r^   r_   r   scale_embeddingmathsqrtrg   r3   
vocab_sizer   weightrm   
ModuleListrangedecoder_layerslayersrn   layernorm_embedding
layer_norm)	r>   r]   r^   r_   r   r   r`   r7   r?   s	    ```  ` r@   r<   zMBartDecoderNoPos.__init__   s    	((&393IRdi///s3v~;
 
 
 #'3':D$m       "'v'<!=!=  

 

 $&<#?#? ,v~66rB   )inputs_embedsdecoder_input_idsrz   r   rD   c                    ||                      |          }|                     |          }| j        D ]} |||          }|                     |          }|S )a>  
        Args:
            decoder_input_ids: Indices of *decoder* input sequence tokens in the
                vocabulary. Padding will be ignored by default should you provide it.
            encoder_hidden_states: Tensor of encoder output embeddings
        Returns:
            Decoder output torch.Tensor
        N)ry   rz   )r   r   r   r   )r>   r   rz   r   kwargsr}   decoder_layers          r@   rG   zMBartDecoderNoPos.forward#  s{        --.?@@M00?? "[ 	 	M)M&3&;  MM
 66rB   weightsc                    g d}t          |                                           }t                      }|D ]\  }}|                    d          r|D ]X\  }}}	||vr|                    ||          }|                    d          r||vr;||         }
|
j        } ||
||	            nD|                    d          r||vr||         }
t          |
dt                    } ||
|           |	                    |           |S )N)).self_attn.qkv_projz.self_attn.q_projq)r   z.self_attn.k_projk)r   z.self_attn.v_projv).encoder_attn.kv_projz.encoder_attn.k_projr   )r   z.encoder_attn.v_projr   embed_positionsz.biasweight_loader)
dictnamed_parametersset
startswithreplaceendswithr   getattrr   add)r>   r   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr   s               r@   load_weightszMBartDecoderNoPos.load_weightsC  sU   "
 "
 "
 4002233"%%%#* 	$ 	$D-011 5K 4 41
Kd**||K<<==)) d+.E.E#D) % 3e]H=== ==)) d+.E.E#D) '@U V Ve]333d####rB   )NNNNr\   )rL   rM   rN   rO   r   r   r   r   rm   	Embeddingr   r<   rR   rS   rG   r   tupler   r   rT   rU   s   @r@   r   r      sA         ,026)-,0#7 #7#7 "D(#7 )4/	#7
  $&#7 lT)#7 #7 #7 #7 #7 #7 #7T .2   <  %|d2	
 |d* 
   @$HU33D-E$F $3s8 $ $ $ $ $ $ $ $rB   r   c                   f    e Zd ZU dZed         ed<   eej         e	dddd          f         ed<   d	S )
NemotronParsePixelInputszx
    Dimensions:
        - b: Batch size
        - c: Number of channels (3)
        - h: Height
        - w: Width
    pixel_valuestypeb   hwdataN)
rL   rM   rN   rO   r   __annotations__r   rR   rS   r0   r   rB   r@   r   r   j  sW           .
!!!!
EL++c1c3"?"??
@@@@@@rB   r   c                      e Zd ZdZefdefdZd Zdej	        dej	        fdZ
dej	        dej	        fdZd	ej        eej                 z  deeej        f         fd
Zd	ej        eej                 z  deeej        f         fdZdS )NemotronParseImageProcessorz'
    NemotronParse Image Processor
    
final_sizec                 T   t          |t          t          f          rCt          |          dk    r0t	          |d                   t	          |d                   f| _        nLt          |t          t          f          r$t	          |          t	          |          f| _        nt          | _        t          j	        t                                        dddd          | _        t          j	        t                                        dddd          | _        |                                  d S )N   r      r   )
isinstancelistr   lenrP   r   rQ   DEFAULT_FINAL_IMAGE_SIZErR   rS   r
   reshape	norm_meanr   norm_std_create_transforms)r>   r   r   s      r@   r<   z$NemotronParseImageProcessor.__init__|  s     j4-00 	7S__5I5I":a=113z!}3E3EFDOO
S%L11 	7":J@DOO6DO&677??1aKK_55==aAqII 	!!!!!rB   c           	      @   	 ddl }n"# t          $ r}t          d          |d}~ww xY wt          | j        t          t
          f          r@t          | j        d                   t          | j        d                   c| _        | _        n t          | j                  x| _        | _        |	                    |
                    | j        | j        t          j        g dd          g          | _        t          j	        t          j                    g          | _        dS )zCreate transform objects.r   NzyThe package `albumentations` is required to use NemotronParse model. Please install it with `pip install albumentations`.r   )   r   r   r4   )
min_height	min_widthborder_modefillp)albumentationsImportErrorr   r   r   r   rP   target_heighttarget_widthComposePadIfNeededcv2BORDER_CONSTANT	transformTToTensortorch_transform)r>   Aerrs      r@   r   z.NemotronParseImageProcessor._create_transforms  s4   	&&&&& 	 	 	#  		 doe}55 	JDOA&''DOA&'' 2D 1 1
 695I5IID!2#1"/ # 3(   

 

  !y
 
  
s    
&!&imagerD   c                    |j         dd         \  }}| j        }| j        }||z  }|}|}||k    r|}t          ||z            }||k    r|}t          ||z            }t	          j        |||ft          j                  S )z[Resize image maintaining aspect ratio (exact replica of original
        LongestMaxSizeHW).Nr   )interpolation)shaper   r   rP   r   resizeINTER_LINEAR)	r>   r   heightwidthmax_size_heightmax_size_widthaspect_ratio
new_height	new_widths	            r@   _resize_with_aspect_ratioz5NemotronParseImageProcessor._resize_with_aspect_ratio  s     BQB,* v~
	 O##(JJ566I ~%%&IY566J zIz*#:J
 
 
 	
rB   c                 Z   |j         dd         \  }}| j        | j        }}t          d||z
            }t          d||z
            }|dk    r|dk    r|S t	          |j                   dk    r t          j        |d|fd|fdfdd          }nt          j        |d|fd|ffdd          }|S )	zUPad image to target size with white padding (matches A.PadIfNeeded
        behavior).Nr   r   r   )r   r   constantr   )modeconstant_values)r   r   r   maxr   nppad)	r>   r   r   r   r   r   pad_hpad_wpaddeds	            r@   _pad_to_sizez(NemotronParseImageProcessor._pad_to_size  s     {2A21 $ 2D4EI
 AzA~&&Ay1}%%A::%1**L u{q  VUaZ0 #	  FF VE
QJ/jRU  F rB   imagesc                 l   t          |t                    s|g}g }|D ]E}t          |t          j                  rt          j        |          }|                    |           Fg }|D ]}|                     |          }| j        |                     |          }|d         }n|                     |          }| 	                    |          }|j
        d         dk    r|                    ddd          }|                    |           t          j        |          }|| j        z
  | j        z  }	d|	iS )	z
        Preprocess an image or batch of images for the NemotronParse model.

        Args:
            images: Input image(s)
        N)r   r   r   r   r   r   )r   r   r	   r   asarrayappendr   r   r  r   r   expandrR   stackr   r   )
r>   r  r   processed_imagesr   r   processed_imagetransformedpixel_values_tensornormalized_valuess
             r@   
preprocessz&NemotronParseImageProcessor.preprocess  se    &$'' 	XF  	+ 	+E%-- *
5))##E**** % 	5 	5E #<<UCCO ~)"nn?nCC"-g"6 #'"3"3O"D"D #'"6"6"G"G #(+q00&9&@&@B&K&K# 34444 {<00 *DN:dmK 122rB   c                      | j         |fi |S r9   )r  )r>   r  r   s      r@   __call__z$NemotronParseImageProcessor.__call__#  s     tv00000rB   N)rL   rM   rN   rO   r   r   r<   r   r   ndarrayr   r  r	   r   r   r   rR   rS   r  r  r   rB   r@   r   r   w  s         5" "" " " "&$
 $
 $
L
rz 
bj 
 
 
 
6"*     <33d5;//33 
c5<	 	33 33 33 33j1kD$551	c5<	 1 1 1 1 1 1rB   r   c            	            e Zd ZdZdededdf fdZddZ	 	 	 ddedz  d	e	j	        e
e	j	                 z  dz  d
eez  dz  defdZ xZS )NemotronParseProcessorz!
    NemotronParse Processor
    r]   	tokenizerrD   Nc                     t                                                       || _        || _        t	          |j                  | _        d S )N)r   )r;   r<   r]   r  r   
image_sizeimage_processor)r>   r]   r  r   r?   s       r@   r<   zNemotronParseProcessor.__init__.  sE     	":fFWXXXrB   c                 >    |g }t          |t                    s|g}|S r9   )r   r   )r>   
input_items     r@   _make_batch_inputz(NemotronParseProcessor._make_batch_input;  s,    J*d++ 	&$JrB   textr  return_tensorsc                       fd||fD             \  }}t          |          dk    ri n                     |          }  j        |fddi|}t          i |||          }|S )Nc                 :    g | ]}                     |          S r   )r  )r   xr>   s     r@   r   z3NemotronParseProcessor.__call__.<locals>.<listcomp>I  s'    JJJa..q11JJJrB   r   add_special_tokensF)r   tensor_type)r   r  r  r   )r>   r  r  r  r   image_inputstext_inputscombined_outputss   `       r@   r  zNemotronParseProcessor.__call__B  s     KJJJD&>JJJf [[A--rr43G3G3O3O$dnTNNeNvNN'0K0<0&
 
 
  rB   r9   )NNN)rL   rM   rN   rO   r   r.   r<   r  r   r	   r   r   r   r  rT   rU   s   @r@   r  r  )  s         Y Y !Y
 
Y Y Y Y Y Y     9=26	   Dj  d5;//$6  j(4/	  
               rB   r  c                       e Zd Zd ZdefdZedefd            Zde	e
edz  f         fdZdefdZded	e	e
ef         de	e
ef         dz  fd
ZdS )NemotronParseProcessingInfoc                 4    | j                                         S r9   )ctxget_hf_configr>   s    r@   r,  z)NemotronParseProcessingInfo.get_hf_configU  s    x%%'''rB   rD   c                      | j         j        t          f|                                 |                                 d|S )N)r]   r  )r+  init_processorr  r,  get_tokenizer)r>   r   s     r@   get_hf_processorz,NemotronParseProcessingInfo.get_hf_processorX  sP    &tx&"
%%''((**
 
 	
 
 	
rB   c                     dS )NTr   r-  s    r@   skip_prompt_length_checkz4NemotronParseProcessingInfo.skip_prompt_length_check`  s    trB   Nc                 
    ddiS )Nr   r   r   r-  s    r@   get_supported_mm_limitsz3NemotronParseProcessingInfo.get_supported_mm_limitsd  s    |rB   c                     |                                  }|j        }|j        j        }|d         |z  |d         |z  dz  z  dz   S )Nr   r      )r,  r  encoder
patch_size)r>   r]   r   r9  s       r@   get_num_image_tokensz0NemotronParseProcessingInfo.get_num_image_tokensg  sN    ##%%&
^.
1+A*1LQR0RSVWWWrB   seq_len	mm_countsc                 2    |                                  }d|iS )Nr   )r:  )r>   r;  r<  image_tokenss       r@   get_mm_max_tokens_per_itemz6NemotronParseProcessingInfo.get_mm_max_tokens_per_itemn  s     
 0022&&rB   )rL   rM   rN   r,  r  r1  propertyboolr3  r   r   rP   r5  r:  r?  r   rB   r@   r)  r)  T  s        ( ( (
,B 
 
 
 
 $    XcDj)A    Xc X X X X'' 38$' 
c	T	!	' ' ' ' ' 'rB   r)  c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	NemotronParseDummyInputsBuilderr<  rD   c                     dS Nr\   r   )r>   r<  s     r@   get_dummy_textz.NemotronParseDummyInputsBuilder.get_dummy_textz  s    rrB   Nr;  
mm_optionsc                     |                     dd          }| j                                        j        \  }}d|                     |||          iS )Nr   r   )r   r   
num_images)getinfor,  r  _get_dummy_images)r>   r;  r<  rG  rI  r   r   s          r@   get_dummy_mm_dataz1NemotronParseDummyInputsBuilder.get_dummy_mm_data}  s_     ]]7A..
&*i&=&=&?&?&J#m T++"=Z ,  
 	
rB   r9   )
rL   rM   rN   r   r   rP   rF  r   r$   rM  r   rB   r@   rC  rC  w  s        S(9 c     =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
rB   rC  c            
           e Zd Zdeee         z  dedeee         z  fdZdedeee	f         deee	f         deee	f         de
f
 fdZde
d	eee	f         deeef         fd
Zded	eee	f         dedee         fdZ xZS ) NemotronParseMultiModalProcessorpromptmm_datarD   c                     dgS )Nr   r   )r>   rP  rQ  s      r@   create_encoder_promptz6NemotronParseMultiModalProcessor.create_encoder_prompt  s    
 s
rB   	mm_kwargs
tok_kwargsc                     |r%t                                          ||||          }n.| j                                        }|j        } ||dd          }|S )NFpt)r#  r  )r;   _call_hf_processorrK  r1  r  )	r>   rP  rQ  rT  rU  processed_outputshf_processorr  r?   s	           r@   rX  z3NemotronParseMultiModalProcessor._call_hf_processor  sv      		 % : :J! !  95577L$.I )	5! ! ! ! rB   	hf_inputshf_processor_mm_kwargsc                 F    t          t          j        d                    S )Nr   )r   )r   r%   batched)r>   r[  r\  s      r@   _get_mm_fields_configz6NemotronParseMultiModalProcessor._get_mm_fields_config  s!    
 !6!>w!G!GHHHHrB   mm_itemsout_mm_kwargsc                 d    | j                                         }t          ddgdg|z            gS )Nr   r   )modalitytargetreplacement)rK  r:  r+   )r>   r`  r\  ra  num_image_tokenss        r@   _get_prompt_updatesz4NemotronParseMultiModalProcessor._get_prompt_updates  sI      999;;  sC"22  
 	
rB   )rL   rM   rN   r   r   rP   r$   rS  r   objectr   rX  r%   r_  r'   r&   r   r,   rg  rT   rU   s   @r@   rO  rO    s[       d3i $ 
tCy	   !! f%! 3;'	!
 CK(! 
! ! ! ! ! !&II !(V 4I 
++	,	I I I I
%
 !(V 4
 -	

 
,	
 
 
 
 
 
 
 
rB   rO  c                        e Zd ZdZ	 	 ddededz  def fdZ	 ddededz  d	efd
Z	de
j        d	e
j        fdZdeeee
j        f                  fdZ xZS )RadioWithNeckz2Vision encoder using RADIO model with custom neck.Nr\   r]   r_   r`   c                    t                                                       |j        | _        |                     ||          | _        d}t          j        d|d          | _        t          j	        |dd          | _
        t          j        ||ddd	d
          | _        t          j	        |dd          | _        t          d||| d          | _        t          j	        |dd          | _        d S )Nr_   i   i   r   gư>T)epselementwise_affine)r   r7  r   F)kernel_sizestridepaddingrf   i   z	.sum_proj)r_   r`   )r;   r<   r8  r]   get_vit_model_from_radio_configmodel_encoderrm   Conv1dconv1rn   layer_norm1Conv2dconv2layer_norm2r   sum_projlayer_norm3)r>   r]   r_   r`   last_hidden_stater?   s        r@   r<   zRadioWithNeck.__init__  s(    	n!AA B 
 

 !Yt%6::
<5T
 
 
 Y
 
 

 <5T
 
 
 -%'''	
 
 
 <5T
 
 
rB   	hf_configrD   c                     |j         }|j                            d          }|t          d|           t	          d||j        d|j        }t          ||          S )NmodelzUnsupported vit model type: )
model_namer  )r]   r_   r   )r8  argsrJ  
ValueErrorr-   r  r    )r>   r}  r_   hf_config_visionr  radio_configs         r@   rr  z-RadioWithNeck.get_vit_model_from_radio_config  s    
 %,%*..w77
HJHHIII" 
! +
 
 #
 
 LIIIIrB   r   c                 z   |                      |          \  }}|                     |                    ddd                                        ddd          }|                     |          }| j        j        }t          |d|j        d         |z  |j        d         |z            }|                     |          }t          |d          }| 	                    |          }| 
                    |                     |          d                   }t          j        ||                    d          fd	          }|S )
Nr   r   r   zb (h w) d -> b d h wr  r   r   zb d h w -> b (h w) ddim)rs  ru  permuterv  r]   r9  r   r   rx  ry  r{  rz  rR   cat	unsqueeze)r>   r   r   summaryfeatureoutputr9  s          r@   rG   zRadioWithNeck.forward   s*   --l;;GOOAq!4455==aAFF!!&))[+
" $
2 $
2	
 
 
 F##6#9::!!&))""4==#9#9!#<==FG$5$5a$8$89qAAArB   r   c                    g }d t          |                                                                           D             }|D ]\  }}|                    d          rF|                    d                    |                    d          dd                    |f           `||         }t          j                    5  t          ||           d d d            n# 1 swxY w Y   | j
                            |           d S )Nc                 D    i | ]\  }}|                     d           ||S )rs  r   )r   r   r   s      r@   
<dictcomp>z.RadioWithNeck.load_weights.<locals>.<dictcomp>  s@     
 
 
e???33
%
 
 
rB   rs  .r   )r   r   itemsr   r	  joinsplitrR   no_gradr   rs  r   )r>   r   model_encoder_weightsadaptor_dictr   r   r   s          r@   r   zRadioWithNeck.load_weights  sF    "
 
#D$9$9$;$;<<BBDD
 
 

  	4 	4GD!// 4%,,chhtzz#qrr7J.K.KQ-OPPPP$T*]__ 4 4)%3334 4 4 4 4 4 4 4 4 4 4 4 4 4 4 	''(=>>>>>s   =CC	!C	rE  r9   )rL   rM   rN   rO   r   r   r   r<   r    rr  rR   rS   rG   r   r   r   rT   rU   s   @r@   rj  rj    s       <<
 37	&
 &
 &
 )4/&
 	&
 &
 &
 &
 &
 &
V 37J J#J )4/J 
	J J J J$EL u|    ,?HU33D-E$F ? ? ? ? ? ? ? ?rB   rj  )rK  dummy_inputsc            	       P    e Zd Zdddedef fdZedededed	z  fd
            Zde	de
d	z  fdZde
dej        fdZde	ded	z  fdZ	 ddej        dej        deej                 d	z  dej        fdZdej        dej        d	z  fdZdeeeej        f                  fdZ xZS )%NemotronParseForConditionalGenerationr\   r   vllm_configr`   c                   t                                                       |j        j        }|| _        |j        | _        |j        }|j        }| 	                    |d          5  t          ||| d          | _        d d d            n# 1 swxY w Y   |                     |          5  t          |j        ||| d          | _        d d d            n# 1 swxY w Y   |j        j        | _        t          |j        j        |j        j        |          | _        t%          | j        |j        j                  | _        d S )Nr   z.encoder)r]   r_   r`   z.decoderre   rl  )r;   r<   model_configr}  r]   r8  vision_configr^   r_   _mark_tower_modelrj  _mark_language_modelr   decoderr   r   rg   lm_headr   logits_processor)r>   r  r`   r]   r^   r_   r?   s         r@   r<   z.NemotronParseForConditionalGeneration.__init__.  s   )3#^"/"/##K99 	 	(LFATATAT  DL	 	 	 	 	 	 	 	 	 	 	 	 	 	 	
 &&{33 	 	,)) ***	  DL	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 !.3%N%v~'=L
 
 
 !0OV^6!
 !
s$   %BBB,!CC Crc  irD   Nc                 N    |                     d          rd S t          d          )Nr   z Only image modality is supported)r   r  )clsrc  r  s      r@   get_placeholder_strz9NemotronParseForConditionalGeneration.get_placeholder_strL  s,    w'' 	4;<<<rB   r   c                    |                     dd           }|                     dd           }||d S ||t          d          |$| j        j        \  }}t	          d|||d          S |t
          t          d          )Nr   image_embedsz0Both pixel values and image embeds are provided.r  )r   r   resolve_bindingsz This line should be unreachable.)popr  r]   r  r   NotImplementedErrorAssertionError)r>   r   r   r  r   r   s         r@   _parse_and_validate_image_inputzENemotronParseForConditionalGeneration._parse_and_validate_image_inputS  s     zz.$77zz.$77L$84#(@OPPP#;)DAq+#!" "    #%%?@@@rB   image_inputc                     |d         dk    sJ |d         }t          | j                                                  j        }|                    |          }|                     |          S )Nr   r   r   )nextr8  
parametersdtypeto)r>   r  r   r  s       r@   _process_image_inputz:NemotronParseForConditionalGeneration._process_image_inputo  se     6"n4444"6*T\,,..//5#u--||L)))rB   c                 R     | j         di |}|d S |                     |          }|S )Nr   )r  r  )r>   r   r  vision_embeddingss       r@   embed_multimodalz6NemotronParseForConditionalGeneration.embed_multimodalx  s?    :d:DDVDD4 55kBB  rB   rC   	positionsencoder_outputsc                 h    d}|rt          j        |d          }|                     ||          }|S )a[  
        Args:
            input_ids: torch.Tensor of *decoder* input token ids.
            positions: torch.Tensor of *decoder* position indices.
            encoder_outputs: List of encoder output tensors (vision embeddings).
                During profiling, this may be None or empty.
        Returns:
            Output torch.Tensor
        Nr   r  )r   rz   )rR   r  r  )r>   rC   r  r  r   r   r}   s          r@   rG   z-NemotronParseForConditionalGeneration.forward  sJ       	>!Io1===M'} % 
 
 rB   r}   c                 8    |                      | j        |          S r9   )r  r  )r>   r}   s     r@   compute_logitsz4NemotronParseForConditionalGeneration.compute_logits  s     $$T\=AAArB   r   c                    t          | j                                                  }dt          dt          fd}dt          dt          fd}dt          fd}g }g }|D ]C\  }}	 ||          rF|                    d                    |                    d          dd                    |	f           W ||          rF|                    d                    |                    d          dd                    |	f            ||          rud                    |                    d          dd                    }
||
         }t          j	                    5  t          ||	           d d d            n# 1 swxY w Y   (t                              d|           E| j                            |           | j                            |           d S )	Nr   rD   c                 ,    |                      d          S )Nr8  r  r   s    r@   
is_encoderzFNemotronParseForConditionalGeneration.load_weights.<locals>.is_encoder      ??9---rB   c                 ,    |                      d          S )Nr  r  r  s    r@   
is_decoderzFNemotronParseForConditionalGeneration.load_weights.<locals>.is_decoder  r  rB   c                 ,    |                      d          S )Nr  r  r  s    r@   
is_lm_headzFNemotronParseForConditionalGeneration.load_weights.<locals>.is_lm_head  r  rB   r  r   zFound unexpected weight: %s)r   r  r   r   rA  r	  r  r  rR   r  r   loggerrK  r8  r   r  )r>   r   lm_head_dictr  r  r  encoder_weightsdecoder_weightsr   r   trimmed_namer   s               r@   r   z2NemotronParseForConditionalGeneration.load_weights  s:   DL99;;<<	.S 	.T 	. 	. 	. 	.	.S 	.T 	. 	. 	. 	.	.S 	. 	. 	. 	.  	A 	AGD!z$ 
A&&C1D(E(Eq'IJJJJD!! A&&C1D(E(Eq'IJJJJD!! A"xx

3(;<<$\2]__ 4 4)%3334 4 4 4 4 4 4 4 4 4 4 4 4 4 4 94@@@@ 	!!/222!!/22222s   E77E;	>E;	r9   )rL   rM   rN   r   r   r<   classmethodrP   r  rh  r   r  rR   rS   r  r   r  r   rG   r  r   r   r   rT   rU   s   @r@   r  r  (  s        BD 
 
 
z 
3 
 
 
 
 
 
< =3 =3 =3: = = = [=AA	!D	(A A A A8*3*	* * * *! !4H44O ! ! ! ! 6:	 < < el+d2	 
   0B|B 
	B B B B 3HU33D-E$F  3  3  3  3  3  3  3  3rB   r  )er   collections.abcr   r   r   typingr   r   r   numpyr   rR   torch.nnrm   einopsr   PILr	   timm.data.constantsr
   r   torchvisionr   r   transformersr   r   r   r   vllm.configr   r   vllm.config.lorar   vllm.config.multimodalr   vllm.loggerr   %vllm.model_executor.layers.activationr   !vllm.model_executor.layers.linearr   r   +vllm.model_executor.layers.logits_processorr   3vllm.model_executor.layers.quantization.base_configr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   %vllm.model_executor.models.interfacesr   r    vllm.model_executor.models.radior    "vllm.model_executor.models.whisperr!   r"   vllm.multimodalr#   vllm.multimodal.inputsr$   r%   r&   vllm.multimodal.parser'   vllm.multimodal.processingr(   r)   r*   r+   r,   %vllm.transformers_utils.configs.radior-   !vllm.transformers_utils.tokenizerr.   vllm.utils.tensor_schemar/   r0   vllm.v1.attention.backendr1   rL   r  r   r3   rW   Moduler[   r   r   r   r   r  r)  rC  rO  rj  register_processorr  r   rB   r@   <module>r     s    7 7 7 7 7 7 7 7 7 7 % % % % % % % % 



                        A A A A A A A A ' ' ' ' ' '            0 / / / / / / / ' ' ' ' ' ' 3 3 3 3 3 3 # # # # # # < < < < < < U U U U U U U U G G G G G G R R R R R R        P O O O O O        8 7 7 7 7 7 V V V V V V V V / / / / / /         
 6 5 5 5 5 5              > = = = = = ; ; ; ; ; ; > > > > > > > > 3 3 3 3 3 3	X		' = = = = =4 = = = = = = = = = = =$b b b b bry b b bJ$ $ $ $ $( $ $ $Nr r r r r	 r r rj
A 
A 
A 
A 
A| 
A 
A 
Ao1 o1 o1 o1 o1 o1 o1 o1d(  (  (  (  (  (  (  ( V '  '  '  '  '"4  '  '  'F
 
 
 
 
67
 
 
.2
 2
 2
 2
 2
9:2
 2
 2
jb? b? b? b? b?BI b? b? b?J ('$	$0  
P3 P3 P3 P3 P3BI7I P3 P3 
P3 P3 P3rB   