
    .`iF                     @   U d dl Z d dlmZmZmZ d dlmZmZmZm	Z	 d dl
Zd dlZd dlmZ d dlmZmZmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z(m)Z)m*Z*m+Z+ d dl,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2 d dl3m4Z4 d dl5m6Z6m7Z7m8Z8m9Z9m:Z: d dl;m<Z< d dl=m>Z>m?Z? ddl@mAZA ddlBmCZCmDZDmEZE ddlFmGZG ddlHmIZImJZJmKZK dZLdZMdZNdZOddddd iZPd!eQd"eQd#eQd$eQfd%ZRd<d'eQfd(ZS G d) d*ejT                  ZU G d+ d,e>          ZV G d- d.e>          ZW G d/ d0e>          ZXeWeXz  ZYe	eZd1<   d=d2Z[ G d3 d4e7          Z\ G d5 d6e4e\                   Z] G d7 d8e6e\                   Z^ e&j_        e^e\e]9           G d: d;ejT        eDeE                      Z`dS )>    N)IterableMappingSequence)	AnnotatedAnyLiteral	TypeAlias)BatchFeaturePretrainedConfigProcessorMixinSequenceFeatureExtractorSiglipVisionConfig)
VllmConfig)BaseDummyOptions)get_pp_group)LogitsProcessor)QuantizationConfig)ParallelLMHead)
LlamaModel)MultiModelKeys)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItemsNestedTensors)AudioProcessorItemsImageEmbeddingItemsImageProcessorItems	ImageSizeMultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilder)BaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdateResolvedPromptUpdate)IntermediateTensors)TensorSchemaTensorShape   )Idefics2VisionTransformer)MultiModalEmbeddingsSupportsLoRASupportsMultiModal)AudioEmbedding)AutoWeightsLoaderWeightsMappermaybe_prefixiJ iK ih zsiglip-so400m-patch14-448        )vit_image_sizevit_patch_sizetoken_compression_factor
orig_widthorig_heighttarget_heighttarget_widthc                     || z  }||z  }||k     rd}|t          ||z            z
  }n|t          | |z            z
  }d}||fS )Nr   )int)r:   r;   r<   r=   ratio_widthratio_heightpadding_widthpadding_heights           u/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/phi4mm.py_get_padding_sizerE   L   sh     +K ;.L\!!&[;-F)G)GG$s:+D'E'EE=((    	layer_idxc                     dddddddd}t          di ||}| d	k     r|j        | z   d
z   }n| d
z   }t          |d|          }|S )Ni  r4   i  siglip_vision_model      r5   )hidden_size
image_sizeintermediate_size
model_typenum_attention_headsnum_hidden_layers
patch_sizer   r+   F)configrequire_post_normnum_hidden_layers_override )r   rR   r,   )rH   kwargsvision_configmodel_configrR   vision_models         rD   get_navit_vision_modelr\   [   s    !+! M &@@@@@L1}}(:YFJ%M,#4  L rF   c                        e Zd ZdZ	 	 ddededz  dededdf
 fd	Z	 dd
ej	        dej	        fdZ
dej	        dej        dej        deej	                 fdZ xZS )Phi4MMImageEncoderzImage embedding. rT   quant_configNprefix	model_dirreturnc           
      >   t                                                       t          |d          r|j        n|j        }t          |j        t                    rA|j                            dd          | _	        |j                            dd          | _
        nd| _	        d| _
        t          | j	                  | _        | j        j        j        j        }|                                \  }}t!          t#          j        |                    }	|	dz  |k    sJ d| d	            |	dz  d
k    rt'          j        d          | _        |	dz  }	|}
|	dz  dz  | _        |	| _        |
| _        d | _        d | _        d| _        d| _        d| _        d| _        d| _        d| _         t'          j!        dd          | _"        d| _#        | j        dz  | _        | j        | j        k    s
J d            | j        s
J d            t'          j$        tK          j&        dd| j        | j#        dz  z  g                    | _'        t'          j$        tK          j&        ddd| j        | j#        dz  z  g                    | _(        |}d}t'          j)        |
| j#        dz  z  |          g}tU          d|          D ]=}|+                    t'          j,                    t'          j)        ||          g           >t'          j-        | | _.        |j/        | _/        d | _0        d| _1        d S )Nn_embdrH   type_featurepatch)rH   r6   zposition embedding size z is not squarer   )r   r+   r   r+   r+   Tsub_glbFr4   avg_pool_2d)kernel_sizestridezDuse_hd_transform and with_learnable_separator should have same valuez,learnable separator is only for hd transform)2super__init__hasattrre   rM   
isinstanceimg_processordictgetrH   rg   r\   
embeddingsposition_embeddingweightsizer?   mathsqrtnnReflectionPad2dimg_processor_paddingnum_img_tokensbase_feat_height_targetimage_dim_out	img_sizesimage_attention_maskuse_hd_transformwith_learnable_separatorhd_transform_orderfreeze_img_processor	crop_sizeimage_token_compression_cls	AvgPool2dimage_token_compressionbase_feat_height_reduction	Parametertorchzerosglb_GNsub_GNLinearrangeextendGELU
Sequentialimg_projection
vocab_sizeimg_featuresuse_out_place_operations)selfrT   r`   ra   rb   rM   	pe_weightLDHr   dim_projectiondepthlayers_	__class__s                  rD   rn   zPhi4MMImageEncoder.__init__x   s*    	 (/vx'@'@XfmmfFX f*D11 	(#155k2FFDN & 4 8 8 Q QDDN 'D3dnMMM&1DK	~~1	!!tqyyyFQFFFyyyq5A::)+);L)I)ID&FA Av!m'($*$(! !%(,%"+$)! ,9(')|!'L'L'L$*+''+'Cq'H$ $(EEEER FEE $TT&TTT$lKAt1D4SUV4VVWXX
 
 lKAq$,t/NPQ/QQR 
 
 %I ? BBN 

 q% 	R 	RAMM2799bi&O&OPQQQQ mV4 + (-%%%rF   
img_embedsc                    |                      ||          }| j        dk    r5|}| j        d u}t          | dd           d u}|s|rt	          t          j        |                    d                              }|                    d|||                    d                    }|	                    dddd          }|r| 
                    |          }|r|                     |          }|	                    dddd          }|                    d|                    d          |                    d          z  |                    d                    }|S t          )	N)patch_attention_maskrh   r|   r+   rG   r      r6   )rq   rg   r   getattrr?   rx   ry   rw   viewpermuter|   NotImplementedError)r   r   attention_maskimg_featurepatch_featureuse_token_compressionuse_paddingwidths           rD   get_img_featuresz#Phi4MMImageEncoder.get_img_features   s    ((^ ) 
 
 '''M$($@$L!!$(?FFdRK$  DIm&8&8&;&;<<== - 2 2um&8&8&<&<! ! !. 5 5aAq A A N$($>$>}$M$MM( P$($@$@$O$OM !. 5 5aAq A A - 2 2!&&q))M,>,>q,A,AA!&&r**! ! ! !!rF   pixel_valuesimage_sizesr   c           
      J   t          | j        t          j                  r/| j        d         j        j        }| j        d         j        j        }n"| j        j        j        }| j        j        j        }|}|j        \  }}}	}
}|}|                    dd          }| 	                    ||
                    t          j                                      dd                              |                    }| j        }| j        }| j        }t#          t%          j        |j        d                             x}}||k    r||k    sJ d| d| d| d            |                    |d||z  | j                  }| j        }|}g }g }t          |t          j                  r|                    dd          }t/          |          D ]}||         \  }
}|
|z  }
||z  }|
|z  }||d	df         }|                    d|||                              d||z  |||z  ||                                                              ddd
ddd                              d||z  ||z  ||z  |z                                            }| j                            d||z  dd          }t          j        ||gd                              dd||z  |z            }||dd	f         }|d	|         }|                    ||||                              |||z  |||z  ||                                                              ddd
ddd                              |d||z  |z                                            }|                    d|
|||z  ||z  d                              ddd
ddd                              d|
|z  |z  ||z  |z  ||z  |z            }|t=          |          dk    rk||d|dz   dd	ddd	df                             d|
|||z  ||z                                ddd
dd                              d|
|z  |z  ||z  |z            }t#          |dd	d	df                                                                                    }t#          |ddd	d	f                                                                                    }|d	d	d	|d	|f         }| j                            d|dd          } t#          ||d	|dz   dd	ddd	df                                                                                    |dz   z   ||z  z   }!nO| j                            d|
|z  |z  dd          } t#          |
|z  dz   | j!        z  dz   |
dz   |z  |z  z             }!t          j        || gd                              dd||z  |z            }| j"        dk    r2|#                    t          j        || j$        |gd                     nU| j"        dk    r2|#                    t          j        || j$        |gd                     ntK          d| j"         d          |!|d         j        d         k    s!J d|! d|d         j        d                      |#                    |!           g }"|D ]e}#|                     |#                    |                              |                    }$|"#                    |$&                    d                     f|"S )a  
        process image and return vision embeddings.

        pixel_values: (num_images, num_crops, c, h, w)
        image_sizes: [[h1, w1], [h2, w2]]
        image_attention_mask: num_images x num_crops x 32 x 32
        output: (num_images, num_img_tokens, hidden_size)
        r   r+   zbase_feat_height: z, base_feat_width: z	, expect z features for hd transformrG   r6   Nr         dimglb_subri   zhd_transform_order = z, not implementedz
temp_len: z, output_imgs[-1].shape[1]: )'rp   r   rz   r   biasdevicedtypeshapeflattenr   typer   
BoolTensortor~   r   r   r?   npry   r   r   Tensorr   reshape
contiguousr   r   repeatcatlensumitemr}   r   appendr   r   squeeze)%r   r   r   r   target_devicetarget_dtyper   
num_images	num_cropschwbsr   r~   base_resolutionr   base_feat_heightbase_feat_widthCr   output_imgs
output_len_bsB_global_img_featureglb_imgtemp_glb_GNsub_imgreshaped_image_attention_maskuseful_heightuseful_widthtemp_sub_GNtemp_lenimg_set_tensor_output_imgimg_feature_projs%                                        rD   forwardzPhi4MMImageEncoder.forward   s   ( d)2=99 	: /27>M.q16<LL /4;M.39L	)5);&
Iq!Q#++Aq11,, %%e&677??1EEHHWW
 

 #'">.%)%D"-09KA9N1O1O-P-PP? 777#::::J!1 J J /J J-J J J ;:; $(($68J
 
 
i.. 	.!r1--I99 I	( I	(CS>DAq_$A_$AQB ".c2A2g!6 #**1aA6633.33.  Aq!Q**3333.1KKaO	  % ( +,,Q5O0OQRTUVVK i+ 6A>>>FF214NNQRR G
 #37+G crclG
 Aq!,,33.33.  Aq!Q**69SSVWW   $ $(BB#'AA  Aq!Q**((,FF'+EE.1KKaO	  $ $/C8L4M4MPQ4Q4Q(a"q&j!$Q$1)DEW(,FF'+EE  WQ1a++W,,0JJO+/II  .  !$$A!QQQ'$J$N$N$P$P$U$U$W$W X X"#@Aqqq#I#M#M#O#O#T#T#V#VWW!!!!^m^]l]"BC"k00M1aHH,S(BF(ADqD!$Q$-FGKKMMRRTTUU$q(*&*DDE  #k00q++/II1a  UQY$"551u 004NNO  i+ 6A>>>FF214NNQRR G &)33""59gt{G-LRS#T#T#TUUUU(I55""59gt{G-LRS#T#T#TUUUU)VD,CVVV  
 {24Q7777.X . .r?(+. . 877
 h''''& 	? 	?K#22}--00>>    !!"2":":1"="=>>>>rF   )r_   r_   N)__name__
__module____qualname____doc__r   r   strrn   r   FloatTensorr   r   listr   __classcell__r   s   @rD   r^   r^   u   s        P. P. P. )4/P. 	P.
 P. 
P. P. P. P. P. P.f =A$" $"+$"		$" $" $" $"LQ'Q \Q $l	Q
 
e	 Q Q Q Q Q Q Q QrF   r^   c            
       6   e Zd ZU dZed         ed<   eej        e	ej                 z   e
ddddddh	          f         ed<   eej         e
dd
          f         ed<   ee	e          e
d          f         ed<   eej         e
dddd          f         ed<   dS )Phi4MMImagePixelInputsaX  
    Dimensions:
        - bn: Batch size * number of images
        - p: Number of patches (1 + num_patches)
        - c: Number of channels (3)
        - h: Height of each image patch
        - w: Width of each image patch
        - nc: Number of crops
        - H_mask: Height of attention mask
        - W_mask: Width of attention mask
    r   r   bnpr   r   r   dynamic_dimsr6   r   r}   nc    r   N)r   r   r   r   r   __annotations__r   r   r   r   r*   r?   rW   rF   rD   r   r     s        
 
 .
!!!!tEL))#q#s#	
 	
 	
	    D!	   
 S	D	   
 $D$B''	)     rF   r   c                       e Zd ZU dZed         ed<   eej        e	ej                 z   e
ddddh          f         ed<   dS )	Phi4MMAudioFeatureInputsz^
    Dimensions:
        - bn: Batch size * number of audios
        - t: Time frames (M)
    audio_featuresr   r   tP   r   N)r   r   r   r   r   r   r   r   r   r   r*   rW   rF   rD   r   r     sy           "
####tEL))D#r666	8     rF   r   c                   \    e Zd ZU dZed         ed<   ee edddd          f         ed<   d	S )
Phi4MMAudioEmbeddingInputsz
    Dimensions:
        - b: Batch size
        - n: Number of audios
        - f: Audio feature size
        - h: Hidden size (must match language model backbone)
    audio_embedsr   bnfr   dataN)	r   r   r   r   r   r   r   r   r*   rW   rF   rD   r  r    se           .
!!!!
Cc3''	)     rF   r  Phi4MMAudioInputsc                      d                                          t          fd dd         D                       s
J d             fdt                    D             }t          fd D                       |<    d                             ||          }d} D ]Sfdt                    D             }t          ||j                 z             |<   ||<   |j                 z  }T|S )	z<
    cat along dim, while pad to max for all other dims
    r   c              3   H   K   | ]}|                                 k    V  d S r   r   ).0r   ndims     rD   	<genexpr>zcat_with_pad.<locals>.<genexpr>  s/      441quuww$444444rF   r+   Nz3All tensors must have the same number of dimensionsc                 H    g | ]t          fd D                       S )c              3   2   K   | ]}|j                  V  d S r   r   )r  r   is     rD   r  z*cat_with_pad.<locals>.<listcomp>.<genexpr>  s)      001AGAJ000000rF   )max)r  r  tensorss    @rD   
<listcomp>z cat_with_pad.<locals>.<listcomp>  s7    FFFQ000000000FFFrF   c              3   2   K   | ]}|j                  V  d S r   r  )r  r   r   s     rD   r  zcat_with_pad.<locals>.<genexpr>  s)      66666666rF   c                 F    g | ]}t          d j        |                   S r   )slicer   )r  dr   s     rD   r  z cat_with_pad.<locals>.<listcomp>  s)    <<<1%171:&&<<<rF   )r   allr   r   new_fullr  r   )	r  r   padding_valueout_sizeoutputindexslicesr  r   s	   ``     @@rD   cat_with_padr"    s/    1:>>D444444444  = 4 GFFF%++FFFH6666g66666HSMQZ  =99FE  <<<<d<<<E5173<#788svMrF   c                   L   e Zd Zedee         fd            Zedee         fd            Z	 d dedz  de	fdZ
dedefdZdeee	dz  f         fd	Zd
e	de	de	de	de	f
dZ	 d!d
e	de	de	de	de	de	fdZddde	de	dedz  de	fdZ	 d dedz  defdZde	dede	fdZde	de	fdZdS )"Phi4MMProcessingInforc   c                 4    d t          d          D             S )Nc                      g | ]}d |dz    dS )<|image_r+   |>rW   r  r  s     rD   r  z5Phi4MMProcessingInfo.image_tokens.<locals>.<listcomp>)  (    999$1q5$$$999rF   d   r   r   s    rD   image_tokensz!Phi4MMProcessingInfo.image_tokens'      99eCjj9999rF   c                 4    d t          d          D             S )Nc                      g | ]}d |dz    dS )<|audio_r+   r(  rW   r)  s     rD   r  z5Phi4MMProcessingInfo.audio_tokens.<locals>.<listcomp>-  r*  rF   r+  r,  r-  s    rD   audio_tokensz!Phi4MMProcessingInfo.audio_tokens+  r/  rF   N	processorc                 J    ||                                  }|j        }|j        S r   )get_hf_processorimage_processor
dynamic_hd)r   r4  r7  s      rD   get_dynamic_hdz#Phi4MMProcessingInfo.get_dynamic_hd/  s,     --//I#3))rF   rX   c                 &     | j         di |j        S )NrW   )r6  audio_processor)r   rX   s     rD   get_feature_extractorz*Phi4MMProcessingInfo.get_feature_extractor8  s    $t$..v..>>rF   c                     d d dS )N)audioimagerW   r-  s    rD   get_supported_mm_limitsz,Phi4MMProcessingInfo.get_supported_mm_limits;  s    ---rF   r:   r;   rN   max_nummin_numc                    t          j        |t          |          z            }t          j        |t          |          z            }||z  k    r||z  }t          fdt	          ddz             D                       }	t          |	d           }	|                                 j        }
|
                    ||	|||          }||d         z  }||d         z  }n||z  }||z  }||f}|||fS )Nc              3   p   K   | ]0}t          d d z             D ]}||z  k    ||z  k    ||fV  1dS )r+   Nr,  )r  r  jrA  rB  s      rD   r  zAPhi4MMProcessingInfo._find_target_aspect_ratio.<locals>.<genexpr>L  su          q'A+..    q5G##A(8(8 A )9(8(8(8(8	   rF   r+   c                 $    | d         | d         z  S )Nr   r+   rW   )xs    rD   <lambda>z@Phi4MMProcessingInfo._find_target_aspect_ratio.<locals>.<lambda>R  s    !qt rF   )keyr   )	rx   ceilfloatsetr   sortedr6  r7  find_closest_aspect_ratio)r   r:   r;   rN   rA  rB  
w_crop_num
h_crop_numaspect_ratiotarget_ratiosr7  target_aspect_ratior=   r<   s       ``        rD   _find_target_aspect_ratioz.Phi4MMProcessingInfo._find_target_aspect_ratio>  sE    YzE*,=,==>>
Y{U:->->>??

"W,,%3L            q'A+..       M #=6K6KLLLM #3355EO"1"K"K# # &(;A(>>L&)<Q)??MM%
2L&3M#-z":"M<??rF   r6   dynamic_hd_sizer7   r8   r9   c                 0   ||z  dk    s
J d            ||z  |z  dk    s
J d            |                      ||||d          \  }}}	|d         |z  |	k    sJ |d          d| d|	             |d         |z  |k    sJ |d          d| d|             ||z  dk    r	|	|z  dk    sJ t          ||||	          \  }
}|dk    s|
dk    s
J d            |	|z  }||z  }||k    r-|
dk    s
J d	            |t          j        ||z            z
  }|}n7|
|k    r-|dk    s
J d
            |t          j        |
|z            z
  }|}n|}|}||z  }||z  }||z  dk    r|dz  }||z  dk    r|dz  }||z  }|}||z  }||z  dz  }d}||z  }||z   |z   |z   |z   S )av  
        compute the number of tokens an image is expected to take up considering
        the image encoder architecture and exclude output features containing
        only padding pixels

        for siglip, vit_image_size=448, vit_patch_size=14, so output will be
        32x32 feature map
        NOTE right now, Phi4MM uses hard-coded token_compression_factor=2
        r   z2vit_image_size must be divisible by vit_patch_sizezNvit_image_size // vit_patch_size must be divisible by token_compression_factorr+   )rB  z * z != z)padding_width or padding_height must be 0zpadding_height not 0zpadding_width not 0r6   )rT  rE   rx   floor)r   r:   r;   rU  r7   r8   r9   rS  r<   r=   rC   rB   target_feat_widthtarget_feat_heightnon_pad_feat_widthnon_pad_feat_height
feat_widthfeat_heightnum_hd_patch_tokensnum_hd_newline_tokensvit_feature_sizenum_global_image_tokensnum_sep_tokensnum_global_image_newline_tokenss                           rD   _compute_num_image_tokensz.Phi4MMProcessingInfo._compute_num_image_tokensg  s   $ .!333@ 433 /2JJaOOO' POO **KRS +   	9]L
 #1%6,FFF"1%LL.LLlLL GFF #1%6-GGG"1%MM.MMmMM HGG N*a//L>4QUV4V4V4VV ):]L)
 )
% !!^q%8%8%87 &9%88 )N:*n<N**!Q&&&(>&&&!2TZ.6 6 " #5~-- A%%%'<%%%"4tz/8 8 # "3 "3"4'+CC
)-EE 88A==!OJ!99Q>>1K(;6 +)^;#37O#OTU"U*:>V*V' $!" $$ .	.	
rF   r4  image_widthimage_heightc                    |                                  }|j        }|t          }t          |         }|d         }|d         }|d         }	|                     |          }
|                     |||
|||	          }|S )Nr7   r8   r9   re  )rU  r7   r8   r9   )get_hf_configrq   SIGLIP_NAME#VISION_ENCODER_TO_PROCESSING_CONFIGr9  rd  )r   rf  rg  r4  	hf_configvision_encoder_nameprepro_configr7   r8   r9   rU  image_num_tokenss               rD   get_num_image_tokensz)Phi4MMProcessingInfo.get_num_image_tokens  s     &&((	'5&"-;<OP&'78&'78#01K#L --	-BB99+))%= : 
 
  rF   c                     |                                  }|j        }|t          }t          |         }|d         }||                     |          z  }t          ||          S )Nr7   re  )heightr   )ri  rq   rj  rk  r9  r   )r   r4  rl  rm  rn  r7   max_sides          rD   !get_image_size_with_most_featuresz6Phi4MMProcessingInfo.get_image_size_with_most_features  sn     &&((	'5&"-;<OP&'78!D$7$7)$7$L$LL????rF   	audio_lensrc                     |dk    r	||dz  z  }n.d|cxk    rdk     r	n n|dz  }n|dk     rt          d|           d}d}||z
  |z  dz   }|dk     rt          d          |S )	a  
        Compute the output size of the `extract_features` method.

        Args:
            audio_len (int): Length of the input waveform in samples.
            sr (float): Sampling rate of the waveform, either 16000 or 8000.

        Returns:
            tuple (int, int): Output size as (T, D), where:
                T: Number of time frames.
                D: Number of Mel filterbank bins (80).
        i>  i@  r6   zUnsupported sample rate i     r+   z(Waveform too short for given parameters.)RuntimeError
ValueError)r   ru  rv  
win_length
hop_length
num_framess         rD   get_audio_num_framesz)Phi4MMProcessingInfo.get_audio_num_frames  s     ::"+%IIR%NII$YY>">>??? 

  *,;a?
>>GHHH rF   audio_framesc                     |                                  }|j        d         d         }d}||z  }||z  }|dk    r|n|dz   }||z  }||z  }|dk    r|n|dz   }|S )zj
        Compute the audio embedding size based on the audio frames and
        compression rate.
        audio_embd_layercompression_rater+   r   )ri  
embd_layer)r   r  rl  r  qformer_compression_rateinteger	remainderresults           rD   _compute_audio_embed_sizez.Phi4MMProcessingInfo._compute_audio_embed_size  s    
 &&((	$/0BCDVW $% "22 #33	%NN!4455	%NN!rF   r   )r6   )r   r   r   propertyr   r   r.  r3  r   r?   r9  objectr   r<  r   r@  rT  rd  rp  r   rt  rK  r~  r  rW   rF   rD   r$  r$  &  sr       :d3i : : : X: :d3i : : : X:
 ,0* *!D(* 
* * * *?f ?9Q ? ? ? ?.cDj)A . . . .'@'@ '@ 	'@
 '@ '@ '@ '@ '@` )*W
 W
W
 W
 	W

 W
 W
 #&W
 W
 W
 W
| ,0        	 
 "D(  
       > ,0@ @!D(@ 
@ @ @ @!c !u ! ! ! ! !Fc c      rF   r$  c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	Phi4MMDummyInputsBuilder	mm_countsrc   c                     |                     dd          }|                     dd          }| j        j        d |         }| j        j        d |         }d                    ||z             S )Nr>  r   r?  r_   )rs   infor.  r3  join)r   r  
num_audiosr   r.  r3  s         rD   get_dummy_textz'Phi4MMDummyInputsBuilder.get_dummy_text&  se    ]]7A..
]]7A..
"&)"8*"E"&)"8*"Eww|l2333rF   Nseq_len
mm_optionsc                 h   |                     dd          }|                     dd          }| j                                        \  }}|r|                     d          nd }|r|                     d          nd }	|                     ||||          |                     t
          ||	          d}
|
S )Nr>  r   r?  )r   rr  r   	overrides)lengthr  r  )r?  r>  )rs   r  rt  _get_dummy_images_get_dummy_audios_AUDIO_MAX_SOUNDFILE_SIZE)r   r  r  r  r  r   r=   r<   image_overridesaudio_overridesmm_datas              rD   get_dummy_mm_dataz*Phi4MMDummyInputsBuilder.get_dummy_mm_data/  s     ]]7A..
]]7A..
&*i&Q&Q&S&S#m5?I*..111T5?I*..111T ++"$%)	 ,   ++0%) ,  
 
 rF   r   )
r   r   r   r   r   r?   r  r   r   r  rW   rF   rD   r  r  %  s        4S(9 4c 4 4 4 4 =A	  38$ C!112T9	
 
     rF   r  c            
            e Zd ZdefdZdedeeef         deeef         deeef         def
 fdZ	ded	eeef         deee
f         fd
Zded	eeef         dedee         fdZdededef fdZ xZS )Phi4MMMultiModalProcessorrc   c                 `    | j                                         }t          |j        d          S )Nscipy)	target_sraudio_resample_method)r  r<  r!   sampling_rate)r   feature_extractors     rD   _get_data_parserz*Phi4MMMultiModalProcessor._get_data_parserO  s5     I;;==#'5W
 
 
 	
rF   promptr  	mm_kwargs
tok_kwargsc                 8   
 |sa j                                                             |          }                     |          }t	          t          |g          d          S   j         j        di |j        |                    dg           x}rfd|D             |d<   t                      
                    ||||          } fd|d         D             }||d<   |d	         
 fd
|D             }	
fdt          |	          D             |d	<   |S )N)	input_idspt)tensor_typeaudiosc                     g | ]}|fS rW   rW   )r  r  rv  s     rD   r  z@Phi4MMMultiModalProcessor._call_hf_processor.<locals>.<listcomp>c  s     C C C$ C C CrF   c                 `    g | ]*}j                             |d          |d                   +S )r   r+   )rf  rg  )r  rp  )r  img_sizer   s     rD   r  z@Phi4MMMultiModalProcessor._call_hf_processor.<locals>.<listcomp>i  sO     
 
 
  I**$QKhqk +  
 
 
rF   r   r}   input_audio_embedsc                 `    g | ]*}j                             t          |                    +S rW   )r  r~  r   )r  r>  r   rv  s     rD   r  z@Phi4MMMultiModalProcessor._call_hf_processor.<locals>.<listcomp>r  s>     
 
 
?DDI**3u::r::
 
 
rF   c                 .    g | ]\  }}|d |f         S r   rW   )r  idxrw   r   s      rD   r  z@Phi4MMMultiModalProcessor._call_hf_processor.<locals>.<listcomp>u  s7     3
 3
 3
+43N3:&3
 3
 3
rF   rW   )r  get_tokenizerencode_apply_hf_processor_tokens_onlyr
   rr   r<  r  rs   rm   _call_hf_processor	enumerate)r   r  r  r  r  
prompt_ids
audio_dataprocessed_outputsr}   feature_sizesr   rv  r   s   `         @@rD   r  z,Phi4MMMultiModalProcessor._call_hf_processorU  s     	P002299&AAJ==jIIJ
| < < <$OOOO,TY,99y99G Xr222: 	D C C C C
 C C CGH!GG66GY

 

 
 
 
 .m<	
 
 
 /=*+*+?@
 
 
 
 
HR
 
 
3
 3
 3
 3
8A-8P8P3
 3
 3
./ ! rF   	hf_inputshf_processor_mm_kwargsc           	          t          t          j        d          t          j        d          t          j        d          t          j        d          t          j        d                    S )Nr?  r>  )input_image_embedsr   r   r}   r  )rr   r   batched)r   r  r  s      rD   _get_mm_fields_configz/Phi4MMMultiModalProcessor._get_mm_fields_config{  sa    
 4<WEE!6!>w!G!G-5g>>08AA4<WEE
 
 
 	
rF   mm_itemsout_mm_kwargsc                    	  j         j        } j         j        }  j         j        di |  j         j        di |	dt
          f	 fd}dt
          f fd}t          d|j        |          t          d|j        |          gS )Nitem_idxc                 ,                        dt          t          f          }t          |t                    r|                    |           }n<|                    |           }j                            |j        |j	                  }t          g|z  S )Nr?  )rf  rg  r4  )	get_itemsr   r   rp   get_feature_sizeget_image_sizer  rp  r   rr  _IMAGE_PLACEHOLDER_TOKEN_ID)r  imagesnum_image_tokensrN   hf_processorr  r   s       rD   get_image_replacement_phi4mmzSPhi4MMMultiModalProcessor._get_prompt_updates.<locals>.get_image_replacement_phi4mm  s    ''-/BC F &"566 #)#:#:8#D#D  #228<<
#'9#A#A * 0!+!2* $B $ $  003CCCrF   c                                          dt                    }|                    |           }j                            |j                  }j                            |          }t          g|z  S )Nr>  )r  r   get_audio_lengthr  r~  r  r  _AUDIO_PLACEHOLDER_TOKEN_ID)r  r  ru  r  audio_embed_sizer  r  r   s        rD   get_audio_replacement_phi4mmzSPhi4MMMultiModalProcessor._get_prompt_updates.<locals>.get_audio_replacement_phi4mm  ss    ''1DEEF//99I999,: L  $yBB<PP/03CCCrF   r?  )modalitytargetreplacementr>  rW   )r  r.  r3  r<  r6  r?   r%   __getitem__)
r   r  r  r  r.  r3  r  r  r  r  s
   ``      @@rD   _get_prompt_updatesz-Phi4MMMultiModalProcessor._get_prompt_updates  s    #')"8"&)"8;DI;UU>TUU1ty1KK4JKK	D3 	D 	D 	D 	D 	D 	D 	D 	D"		D3 		D 		D 		D 		D 		D 		D 		D 		D  #/8  
  #/8  
 	
rF   cached_updatenew_item_idxc                    t                                          ||          }|j        dk    r(| j        j        }|                    ||                   }n2|j        dk    r'| j        j        }|                    ||                   }|S )Nr?  r>  )rm   _recompute_cached_prompt_updater  r  r.  with_targetr3  )r   r  r  
new_updater.  r3  r   s         rD   r  z9Phi4MMMultiModalProcessor._recompute_cached_prompt_update  s    
 WW<<
 


 !W,,&*i&<L#//\0JKKJJ#w..&*i&<L#//\0JKKJrF   )r   r   r   r!   r  r   r   r  r
   r  r   r  r    r   r   r   r&   r  r'   r?   r  r   r   s   @rD   r  r  N  sl       
"6 
 
 
 
$!$! f%$! 3;'	$!
 CK($! 
$! $! $! $! $! $!L

 !(V 4
 
++	,	
 
 
 
2
%2
 !(S 12
 -	2

 
,	2
 2
 2
 2
h+  
	         rF   r  )r  dummy_inputsc                       e Zd ZdZdgdgdZ eddiddd	d
d          Zedede	dedz  fd            Z
dddedef fdZdededz  fdZdededefdZdededz  fdZdedefdZdedeej                 fdZdedefdZ	 	 d*d ej        d!ej        d"edz  d#ej        dz  dedej        fd$Zd%ej        dej        dz  fd&Zd'ee eej        f                  ddfd(Z!de"fd)Z# xZ$S )+Phi4MMForCausalLMzA
    Implements the Phi-4-multimodal-instruct model in vLLM.
    qkv_projgate_up_proj)r  r  zbase_layer.r_   z0embed_tokens_extend.audio_projection_for_vision.z%embed_tokens_extend.audio_projection.zembed_tokens_extend.zvision_encoder.)z>model.embed_tokens_extend.audio_embed.audio_projection.vision.z>model.embed_tokens_extend.audio_embed.audio_projection.speech.z&model.embed_tokens_extend.audio_embed.z&model.embed_tokens_extend.image_embed.)orig_to_new_substrorig_to_new_prefixr  r  rc   Nc                     |                     d          rd| dS |                     d          rd| dS t          d          )Nr?  r'  r(  r>  r2  z)Only image or audio modality is supported)
startswithrz  )clsr  r  s      rD   get_placeholder_strz%Phi4MMForCausalLM.get_placeholder_str  s_    w'' 	$#a####w'' 	$#a####DEEErF   )ra   vllm_configra   c          	         t                                                       |j        j        }|j        j        }|s
J d            |j        }|| _        || _        || _        t                      j        dk    s
J d            | 	                    |ddh          5  t          ||d|j                  | _        d d d            n# 1 swxY w Y   t          |j        d         t                    r#d	|j        d         d	         i|j        d         }nd	| j        j        d	         i}| 	                    |d
          5  t!          |fi || _        d d d            n# 1 swxY w Y   |                     |          5  t'          |t)          |d                    | _        d d d            n# 1 swxY w Y   t-          |j        |j        |t)          |d                    | _        |j        r)| j                            | j        j                  | _        t;          |dd          }t=          |j        |          | _        d S )Nzmultimodal_config is requiredr+   z"pipeline parallel is not supportedr?  videozmodel.vision_embed_tokens)ra   rb   r  embedding_clsr>  model)r  ra   lm_head)r`   ra   logit_scaleg      ?)scale) rm   rn   rZ   rl  multimodal_configr`   rT   r   
world_size_mark_tower_modelr^   _name_or_pathvision_encoderrp   r  rr   r0   embed_tokens_extend_mark_language_modelr   r3   r  r   r   rM   r  tie_word_embeddingstie_weightsembed_tokensr   r   logits_processor)	r   r  ra   rT   r  r`   embedding_configr  r   s	           rD   rn   zPhi4MMForCausalLM.__init__  s6   )3'4F AA"AAA "/!2( ~~(A---/S---##K'71CDD 	 	"42 .	# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 f'(:;TBB 	!23E!F!W #$67   !7!H  ##K99 	R 	R'5f'Q'Q@P'Q'QD$	R 	R 	R 	R 	R 	R 	R 	R 	R 	R 	R 	R 	R 	R 	R &&{33 	 	#'VW0M0M  DJ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	
 &%	22	
 
 
 % 	M<33DJ4KLLDLfmS99 /0A U U Us6   CC	C	=EE #E <%F--F14F1rX   c                     |                     dd          }|                     dd          }||dS |t          d|          S |t          d|          S t          d          )aL  
        Parse and validate the audio input to the model.  This handles both
        audio features and audio embeddings, but only the former is used for
        now.

        Args:
            kwargs (object): Keyword arguments.

        Returns:
            Optional[Phi4MMAudioInputs]: Parsed and validated audio inputs.
        r  Nr  r   )r   r   )r   r  z This line should be unreachable.)popr   r  AssertionError)r   rX   r   r  s       rD   _parse_and_validate_audio_inputz1Phi4MMForCausalLM._parse_and_validate_audio_input+  s      $8$??zz.$77!l&:4%+%-   
 #->UUUU?@@@rF   audio_inputaudio_projection_modec                      |d         dk    r|d         S |d         }t           j                                                  j         fd|D             }|S )ad  
        Create the audio embeddings from the audio input, where the audio input
        is pairs of audio features and audio embed lengths.  The audio input is
        created by `input_mapper_for_phi4mm_audio`.

        Args:
            audio_input (Phi4MMAudioInputs): Audio input.

        Returns:
            NestedTensors: Audio embeddings
        r   r  r  r   c                 d    g | ],}                     |                                         -S )r  )r  r   )r  featuresr  r   r   s     rD   r  z:Phi4MMForCausalLM._process_audio_input.<locals>.<listcomp>`  sR     
 
 

 	 $$E""&; %  
 
 
rF   )nextr  
parametersr   )r   r  r  r   r  r   s   ` `  @rD   _process_audio_inputz&Phi4MMForCausalLM._process_audio_inputJ  s     v.00v&&$%56 T-88::;;A
 
 
 
 
 

 +
 
 
 rF   c                     |                     d          }|d S |                     d          }|                     d          }|                     d          }|||
J d            t          d||||          S )Nr  r   r   r}   zMissing image inputsr   )r   r   r   r   r}   )rs   r   )r   rX   r   r   r   r}   s         rD   _parse_and_validate_image_inputz1Phi4MMForCausalLM._parse_and_validate_image_inputi  s     zz"6774jj//%zz*@AA$455#$0**! +*+
 &%#!5)
 
 
 	
rF   c                 t    i }|D ]2}|dv rd|vr | j         di ||d<   |dv rd|vr | j        di ||d<   3|S )N)r  image_embedsr  )r  r  r  rW   )r  r  )r   rX   
modalities	input_keys       rD   %_parse_and_validate_multimodal_inputsz7Phi4MMForCausalLM._parse_and_validate_multimodal_inputs  s    
   
	V 
	VICCCJ..'Kt'K'U'Uf'U'U
8$CCCJ..'Kt'K'U'Uf'U'U
8$rF   image_inputc                     t          | j                                                  j        }|d                             |          }|d         }|d         }|                     |||          }|S )Nr   r   r   )r  r  r  r   r   )r   r  r   r   r   r   r  s          rD   _process_image_inputz&Phi4MMForCausalLM._process_image_input  sv     T(335566<">255e<<!-0*+AB**+';
 
 rF   c                     | j         di |}|sg S d}d}|D ]p}|dk    r1d}|d         }|                     |          }|t          |          z  }|dk    r1|d         }|                     ||          }	|t          |	          z  }q|S )NrW   speechr  visionr  r
  )r  r  tupler  )
r   rX   r  multimodal_embeddingsr  r  r  image_embeddingsr  audio_embeddingss
             rD   embed_multimodalz"Phi4MMForCausalLM.embed_multimodal  s    ?T?II&II
 	I ;= !)" 	A 	AH8##(0%(2#'#<#<[#I#I %/?)@)@@%8##(2#'#<#<7L $= $ $  &/?)@)@@%$$rF   r  	positionsintermediate_tensorsinputs_embedsc                 @    |d }|                      ||||          }|S )N)r#  )r  )r   r  r!  r"  r#  rX   hidden_statess          rD   r   zPhi4MMForCausalLM.forward  s;      + M

 '	 # 
 
 rF   r%  c                 <    |                      | j        |          }|S r   )r   r  )r   r%  logitss      rD   compute_logitsz Phi4MMForCausalLM.compute_logits  s      &&t|]CCrF   weightsc                 ^    t          | dg          }|                    || j                  S )Nlora)skip_substrs)mapper)r1   load_weightshf_to_vllm_mapper)r   r)  loaders      rD   r.  zPhi4MMForCausalLM.load_weights  s2    "4vh???""743I"JJJrF   c                 8    t          j        dddgddg          S )z<
        Get the module prefix in multimodal models
        zmodel.audio_projection_for_visionaudio_projectionr  r  )language_model	connectortower_model)r   from_string_fieldr-  s    rD   get_mm_mappingz Phi4MMForCausalLM.get_mm_mapping  s3     /#46HI)+@A
 
 
 	
rF   )NN)%r   r   r   r   packed_modules_mappingr2   r/  classmethodr   r?   r  r   rn   r  r	  r  r   r  r   r  rr   r  r   r   r   r  r-   r   r(   r   r(  r   r  r.  r   r8  r   r   s   @rD   r  r    s         
 
	  &2
 OANu6L6G	
 
	
 
 
 F3 F3 F3: F F F [F BD 1V 1V 1Vz 1V3 1V 1V 1V 1V 1V 1VfAA	T	!A A A A>,EH	   >

	$	&
 
 
 
0f     &
1
	el	
 
 
 
% %4H % % % %@ <@-1 < < 2D8	
 |d*  
   (| 
	   KHU33D-E$F K4 K K K K
 
 
 
 
 
 
 
 
rF   r  )rG   r  )arx   collections.abcr   r   r   typingr   r   r   r	   numpyr   r   torch.nnrz   transformersr
   r   r   r   r   vllm.configr   vllm.config.multimodalr   vllm.distributedr   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   3vllm.model_executor.layers.vocab_parallel_embeddingr    vllm.model_executor.models.llamar   )vllm.model_executor.models.module_mappingr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   r   vllm.multimodal.parser   r   r   r   r    r!   vllm.multimodal.processingr"   $vllm.multimodal.processing.processorr#   r$   r%   r&   r'   vllm.sequencer(   vllm.utils.tensor_schemar)   r*   idefics2_vision_modelr,   
interfacesr-   r.   r/   phi4mm_audior0   utilsr1   r2   r3   r  r  r  rj  rk  r?   rE   r\   Moduler^   r   r   r  r	  r   r"  r$  r  r  register_processorr  rW   rF   rD   <module>rU     sc    7 7 7 7 7 7 7 7 7 7 5 5 5 5 5 5 5 5 5 5 5 5                         # " " " " " 3 3 3 3 3 3 ) ) ) ) ) ) G G G G G G F F F F F F      8 7 7 7 7 7 D D D D D D / / / / / /                           > = = = = =              . - - - - - > > > > > > > > < < < < < < N N N N N N N N N N ( ( ( ( ( ( A A A A A A A A A A % $ # )$%" "' #))"%)69)IL) ) ) ) c    4L L L L L L L L^
# # # # #\ # # #L    |             8:TT 9 T T T   4| | | | |- | | |~& & & & &56JK & & &R     78L M   D ('	)  
N
 N
 N
 N
 N
	<1C N
 N
 
N
 N
 N
rF   