
    .`i`              
          U d dl mZmZmZ d dlmZmZmZ d dlZd dl	m
Z
 d dlmZmZmZmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZ d dlm Z  d dl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' d dl(m)Z) d dl*m+Z+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4m5Z5 ddl6m7Z7 ddl8m9Z9m:Z:m;Z;  G d de+          Z< G d de+          Z=e<e=z  Z>ee?d<    G d de
j@                  ZA G d de
j@                  ZB G d de
j@                  ZC G d d e
j@                  ZD G d! d"e
j@                  ZE G d# d$e
j@                  ZF G d% d&e
j@                  ZG G d' d(e
j@                  ZH G d) d*e$          ZI G d+ d,e"eI                   ZJ G d- d.e#eI                   ZK ejL        eKeIeJ/           G d0 d1e
j@        e2e3e4e5                      ZMdS )2    )IterableMappingSequence)	AnnotatedLiteral	TypeAliasN)BatchFeatureBlip2ConfigBlip2QFormerConfigapply_chunking_to_forward)CacheConfig
VllmConfig)BaseDummyOptions)
get_act_fn)QuantizationConfig)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)MultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptIndexTargetsPromptInsertionPromptUpdate)IntermediateTensors)TensorSchemaTensorShape   )BlipVisionModelget_blip_num_patches)MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPPSupportsQuant)MultiModelKeys)AutoWeightsLoaderinit_vllm_registered_modelmaybe_prefixc                   f    e Zd ZU dZed         ed<   eej         e	dddd          f         ed<   d	S )
Blip2ImagePixelInputsz
    Dimensions:
        - bn: Batch size * number of images
        - c: Number of channels (3)
        - h: Height of each image
        - w: Width of each image
    pixel_valuestypebn   hwdataN
__name__
__module____qualname____doc__r   __annotations__r   torchTensorr        t/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/blip2.pyr-   r-   2   sW           .
!!!!
EL++dAsC"@"@@
AAAAAAr>   r-   c                   d    e Zd ZU dZed         ed<   eej         e	ddd          f         ed<   dS )	Blip2ImageEmbeddingInputsz
    Dimensions:
        - bn: Batch size * number of images
        - f: Image feature size
        - h: Hidden size (must match the hidden size of language model backbone)
    image_embedsr/   r0   fr2   r4   Nr5   r=   r>   r?   rA   rA   ?   sU           .
!!!!
EL++dC"="==
>>>>>>r>   rA   Blip2ImageInputsc                        e Zd Zddddededz  dedz  ded	ed
df fdZd Z		 dde
j        de
j        dz  fdZ xZS )Blip2QFormerMultiHeadAttentionF is_cross_attentionprefixconfigquant_configNcache_configrI   rJ   returnc                   t                                                       || _        |j        |j        z  dk    r t          d|j         d|j         d          |j        | _        |j        |j        z  | _        | j        | j        z  | _        | j        dz  | _        t          j
        |j        | j                  | _        |r|j        }n|j        }t          j
        || j                  | _        t          j
        || j                  | _        t          |dd          | _        | j        dk    rt#          d| j                   t          j        |j                  | _        d S )	Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()g      position_embedding_typeabsolutez%Unsupported position_embedding_type: )super__init__rK   hidden_sizenum_attention_heads
ValueErrorattention_head_sizeall_head_sizescalingnnLinearqueryencoder_hidden_sizekeyvaluegetattrrQ   NotImplementedErrorDropoutattention_probs_dropout_probdropout)selfrK   rL   rM   rI   rJ   kv_hidden_size	__class__s          r?   rT   z'Blip2QFormerMultiHeadAttention.__init__O   s~    	 ::a??PF$6 P P282LP P P  
 $*#= #)#59S#S !58PP/5Yv143EFF
 	0#7NN#/N9^T-?@@Y~t/ABB
'.-z(
 (
$ ':55%V8TVV   z&"EFFr>   c                      |j         g |                                d d         | j        | j        R  }|                    dddd          S )Nr      r    r1   )viewsizerV   rX   permute)rf   xs     r?   transpose_for_scoresz3Blip2QFormerMultiHeadAttention.transpose_for_scoresy   sQ    AFVAFFHHSbSMV4#;VT=UVVVyyAq!$$$r>   hidden_statesencoder_hidden_statesc                 :   |d u}|rQ|                      |                     |                    }|                      |                     |                    }nP|                      |                     |                    }|                      |                     |                    }|                     |          }|                      |          }t	          j        ||                    dd                    }t	          j        || j        z  d          }	| 	                    |	          }
t	          j        |
|          }|
                    dddd                                          } |j        g |                                d d         | j        R  }|S )Nrj   dimr   rk   r    r1   )rp   r_   r`   r]   r;   matmul	transposesoftmaxrZ   re   rn   
contiguousrl   rm   rY   )rf   rq   rr   rI   	key_layervalue_layermixed_query_layerquery_layerattention_scoresattention_probsattention_probs_droppedcontext_layers               r?   forwardz&Blip2QFormerMultiHeadAttention.forward}   s   
 3$> 	O11$((;P2Q2QRRI33DJJ?T4U4UVVKK11$((=2I2IJJI33DJJ}4M4MNNK JJ}55//0ABB <Y5H5HR5P5PQQ-(84<(GRPPP #',,"?"?%<kJJ%--aAq99DDFF** 
!!#2#&
(,(:
 
 
 r>   N)r6   r7   r8   r   r   r   boolstrrT   rp   r;   r<   FloatTensorr   __classcell__rh   s   @r?   rF   rF   N   s         $)(G (G (G"(G )4/	(G
 "D((G !(G (G 
(G (G (G (G (G (GT% % % ;?   |   %047               r>   rF   c                   `     e Zd Zd
dededdf fdZdej        dej        dej        fd	Z xZ	S )Blip2QFormerSelfOutputrG   rK   rJ   rN   Nc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j	                  | _
        d S Neps)rS   rT   r[   r\   rU   dense	LayerNormlayer_norm_epsrc   hidden_dropout_probre   rf   rK   rJ   rh   s      r?   rT   zBlip2QFormerSelfOutput.__init__   sf    Yv163EFF
f&8f>STTTz&"<==r>   rq   input_tensorc                     |                      |          }|                     |          }|                     ||z             }|S r   r   re   r   rf   rq   r   s      r?   r   zBlip2QFormerSelfOutput.forward   B    
 

=11]33}|'CDDr>   rG   
r6   r7   r8   r   r   rT   r;   r<   r   r   r   s   @r?   r   r              > >1 >3 > > > > > > >| l 
	       r>   r   c                        e Zd Zddddededz  dedz  ded	ed
df fdZ	 dde	j
        de	j        dz  d
ee	j
                 fdZ xZS )Blip2QFormerAttentionFrG   rH   rK   rL   NrM   rI   rJ   rN   c                    t                                                       t          ||||| d          | _        t	          || d          | _        d S )N
.attentionrL   rM   rI   rJ   z.outputrJ   )rS   rT   rF   	attentionr   output)rf   rK   rL   rM   rI   rJ   rh   s         r?   rT   zBlip2QFormerAttention.__init__   sl     	7%%1(((
 
 
 -Vv<N<N<NOOOr>   rq   rr   c                 `    |                      ||          }|                     ||          }|S )Nrr   )r   r   )rf   rq   rr   self_outputattention_outputs        r?   r   zBlip2QFormerAttention.forward   s>    
 nn"7 % 
 
  ;;{MBBr>   r   )r6   r7   r8   r   r   r   r   r   rT   r;   r<   r   tupler   r   r   s   @r?   r   r      s         $)P P P"P )4/	P
 "D(P !P P 
P P P P P P0 ;?   |   %047  
u|		               r>   r   c                   R     e Zd Zd	dededdf fdZdej        dej        fdZ xZ	S )
Blip2QFormerIntermediaterG   rK   rJ   rN   Nc                     t                                                       t          j        |j        |j                  | _        t          |j                  | _	        d S r   )
rS   rT   r[   r\   rU   intermediate_sizer   r   
hidden_actintermediate_act_fnr   s      r?   rT   z!Blip2QFormerIntermediate.__init__   sK    Yv163KLL
#-f.?#@#@   r>   rq   c                 Z    |                      |          }|                     |          }|S r   )r   r   rf   rq   s     r?   r   z Blip2QFormerIntermediate.forward   s,    

=1100??r>   r   r   r   s   @r?   r   r      s        A A1 A3 A A A A A A AU\ el        r>   r   c                   `     e Zd Zd
dededdf fdZdej        dej        dej        fd	Z xZ	S )Blip2QFormerOutputrG   rK   rJ   rN   Nc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j	        |j
                  | _        d S r   )rS   rT   r[   r\   r   rU   r   r   r   rc   r   re   r   s      r?   rT   zBlip2QFormerOutput.__init__   sf    Yv79KLL
f&8f>STTTz&"<==r>   rq   r   c                     |                      |          }|                     |          }|                     ||z             }|S r   r   r   s      r?   r   zBlip2QFormerOutput.forward   r   r>   r   r   r   s   @r?   r   r      r   r>   r   c                        e Zd Zdddededz  dedz  deded	df fd
Zde	j
        de	j
        defdZde	j        d	e	j        fdZde	j        d	e	j        fdZ xZS )Blip2QFormerLayerrG   r   rK   rL   NrM   	layer_idxrJ   rN   c                   t                                                       |j        | _        d| _        t	          |||| d          | _        || _        ||j        z  dk    r$t	          |||d| d          | _        d| _	        nd| _	        t          || d	
          | _        t          || d
          | _        d S )Nr    r   rL   rM   rJ   r   Tz.crossattentionr   Fz.intermediate_queryr   z.output_query)rS   rT   chunk_size_feed_forwardseq_len_dimr   r   r   cross_attention_frequencycrossattentionhas_cross_attentionr   intermediate_queryr   output_query)rf   rK   rL   rM   r   rJ   rh   s         r?   rT   zBlip2QFormerLayer.__init__   s    	'-'E$.%%(((	
 
 
 #v771<<"7))#' 111# # #D (,D$$',D$":f999#
 #
 #
 /v>V>V>VWWWr>   rq   rr   query_lengthc           
         |                      |          }|dk    r|d d d |d d f         }| j        r|                     ||          }t          | j        | j        | j        |          }|j        d         |k    rHt          | j        | j        | j        |d d |d d d f                   }t          j
        ||gd          }n!t          | j        | j        | j        |          }|S )Nr   r   r    ru   )r   r   r   r   feed_forward_chunk_queryr   r   shapefeed_forward_chunkr;   cat)rf   rq   rr   r   r   query_attention_outputlayer_outputlayer_output_texts           r?   r   zBlip2QFormerLayer.forward  s-     >>-88!%5aaa,6I%J"' )-)<)<**? *= * *&
 5-, &	 L  %a(<77$=+0$$QQQqqq%89	% %!  %y,8I)JPQRRR4',  	 L r>   r   c                 \    |                      |          }|                     ||          }|S r   )intermediater   rf   r   intermediate_outputr   s       r?   r   z$Blip2QFormerLayer.feed_forward_chunkI  s2    "//0@AA{{#68HIIr>   c                 \    |                      |          }|                     ||          }|S r   )r   r   r   s       r?   r   z*Blip2QFormerLayer.feed_forward_chunk_queryN  s4    "556FGG(()<>NOOr>   )r6   r7   r8   r   r   r   intr   rT   r;   r   r   r<   r   r   r   r   s   @r?   r   r      s"        %X %X %X"%X )4/	%X
 "D(%X %X %X 
%X %X %X %X %X %XN(((  %0( 	( ( ( (T5< EL    
 %,        r>   r   c                   |     e Zd Zdddededz  dedz  deddf
 fd	Zd
ej	        dej	        de
dej        fdZ xZS )Blip2QFormerEncoderrG   r   rK   rL   NrM   rJ   rN   c                    t                                                       | _        t          j        fdt          j                  D                       | _        d S )Nc                 B    g | ]}t          | d |           S )z.layer.)rL   rM   r   rJ   )r   ).0r   rM   rK   rJ   rL   s     r?   
<listcomp>z0Blip2QFormerEncoder.__init__.<locals>.<listcomp>b  sW     	 	 	  "!-!-'$88Y88  	 	 	r>   )rS   rT   rK   r[   
ModuleListrangenum_hidden_layerslayerrf   rK   rL   rM   rJ   rh   s    ````r?   rT   zBlip2QFormerEncoder.__init__U  s     	]	 	 	 	 	 	 	 "'v'?!@!@	 	 	
 



r>   rq   rr   r   c                 t    t          | j        j                  D ]}| j        |         } ||||          }|S )Nrr   r   )r   rK   r   r   )rf   rq   rr   r   ilayer_modules         r?   r   zBlip2QFormerEncoder.forwardn  sT     t{455 	 	A:a=L(L&;)  MM r>   )r6   r7   r8   r   r   r   r   rT   r;   r   r   r<   r   r   r   s   @r?   r   r   T  s         
 
 
"
 )4/	

 "D(
 
 

 
 
 
 
 
2(  %0 	
 
       r>   r   c                   x     e Zd Zdddededz  dedz  deddf
 fd	Zd
ej	        dej	        dej
        fdZ xZS )Blip2QFormerModelrG   r   rK   rL   NrM   rJ   rN   c                   t                                                       || _        t          j        |j        |j                  | _        t          j        |j	                  | _
        t          |||| d          | _        d S )Nr   z.encoderr   )rS   rT   rK   r[   r   rU   r   	layernormrc   r   re   r   encoderr   s        r?   rT   zBlip2QFormerModel.__init__  s     	f&8f>STTTz&"<==*%%&&&	
 
 
r>   query_embedsrr   c                     |j         d         }|                     |          }|                     |          }|                     |||          }|S )Nr    r   )r   r   re   r   )rf   r   rr   r   embedding_outputsequence_outputs         r?   r   zBlip2QFormerModel.forward  s_    
 $)!,>>,77<<(899,,"7% ' 
 
 r>   )r6   r7   r8   r   r   r   r   rT   r;   r   r<   r   r   r   s   @r?   r   r     s         
 
 
"
 )4/	

 "D(
 
 

 
 
 
 
 
,'  %0 
	       r>   r   c                   B    e Zd Zd Zdeeedz  f         fdZdefdZdS )Blip2ProcessingInfoc                 @    | j                             t                    S r   )ctxget_hf_configr
   rf   s    r?   r   z!Blip2ProcessingInfo.get_hf_config  s    x%%k222r>   rN   Nc                 
    ddiS )Nimager    r=   r   s    r?   get_supported_mm_limitsz+Blip2ProcessingInfo.get_supported_mm_limits  s    |r>   c                 8    |                                  }|j        S r   )r   num_query_tokens)rf   	hf_configs     r?   get_num_image_tokensz(Blip2ProcessingInfo.get_num_image_tokens  s    &&((	))r>   )	r6   r7   r8   r   r   r   r   r   r   r=   r>   r?   r   r     sf        3 3 3cDj)A    *c * * * * * *r>   r   c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	Blip2DummyInputsBuilder	mm_countsrN   c                     dS )NrG   r=   )rf   r   s     r?   get_dummy_textz&Blip2DummyInputsBuilder.get_dummy_text  s    rr>   Nseq_len
mm_optionsc                     | j                                         }|j        }|j        }|                    dd          }|r|                    d          nd }d|                     ||||          iS )Nr   r   )widthheight
num_images	overrides)infor   vision_config
image_sizeget_get_dummy_images)	rf   r   r   r   r   r  max_image_sizer   image_overridess	            r?   get_dummy_mm_dataz)Blip2DummyInputsBuilder.get_dummy_mm_data  s     I++--	!/&1]]7A..
5?I*..111T T++$%%)	 ,  
 	
r>   r   )
r6   r7   r8   r   r   r   r   r   r   r	  r=   r>   r?   r   r     s        S(9 c     =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
r>   r   c            
            e Zd Zdedeeef         deeef         deeef         def
 fdZdedeeef         deeef         fd	Z	d
e
deeef         dedee         fdZ xZS )Blip2MultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsrN   c                     |sN| j                                         }|                    |          }t          t	          |g          d          S t                                          ||||          S )N)	input_idspt)tensor_type)r  r  r  r  )r  get_tokenizerencoder	   dictrS   _call_hf_processor)rf   r  r  r  r  	tokenizer
prompt_idsrh   s          r?   r  z+Blip2MultiModalProcessor._call_hf_processor  s      	P	//11I"))&11J
| < < <$OOOOww))!	 * 
 
 	
r>   	hf_inputshf_processor_mm_kwargsc                 l    t          t          j        d          t          j        d                    S )Nr   )r.   rB   )r  r   batched)rf   r  r  s      r?   _get_mm_fields_configz.Blip2MultiModalProcessor._get_mm_fields_config  s7    
 .6w??.6w??
 
 
 	
r>   mm_itemsout_mm_kwargsc                     | j                                         }|                                }|d         }| j                                         }|g|z  }t	          dt          j                    |          gS )Nz<image>r   )modalitytarget	insertion)r  r  	get_vocabr   r   r   start)	rf   r  r  r   r  vocabimage_token_idnum_image_tokensimage_tokenss	            r?   _get_prompt_updatesz,Blip2MultiModalProcessor._get_prompt_updates  s     I++--	##%%y)999;;&'*::  )/11&  
 	
r>   )r6   r7   r8   r   r   objectr	   r  r   r  r   r   r   r   r+  r   r   s   @r?   r  r    s       

 f%
 3;'	

 CK(
 

 
 
 
 
 
(

 !(V 4
 
++	,	
 
 
 

%
 !(V 4
 -	

 
,	
 
 
 
 
 
 
 
r>   r  )r  dummy_inputsc                       e Zd Zededededz  fd            Zddded	ef fd
Zde	de
dz  fdZdedej        dej        fdZdedej        fdZde
dej        fdZde	defdZ	 	 d#dej        dej        dedz  dej        dz  de	defdZdej        dej        dz  fdZdeeeej        f                  dee         fdZdefdZdedefd Zd!edefd"Z xZ S )$Blip2ForConditionalGenerationr"  r   rN   Nc                 N    |                     d          rd S t          d          )Nr   z Only image modality is supported)
startswithrW   )clsr"  r   s      r?   get_placeholder_strz1Blip2ForConditionalGeneration.get_placeholder_str  s,    w'' 	4;<<<r>   rG   r   vllm_configrJ   c          	      r   t                                                       |j        j        }|j        }|j        }|j        j        }|| _        || _        |j        }t          |j
        |j                  dz   | _        |                     |d          5  t          ||          | _        t!          j        t%          j        d|j        |j        j                            | _        t1          |j        ||| d          | _        t!          j        |j        j        |j        j        d          | _        d d d            n# 1 swxY w Y   |                     |          5  t=          ||j        t?          |d          	          | _         d d d            n# 1 swxY w Y   | j         j!        | _!        d S )
N)r  
patch_sizer    r   z.qformer)rM   rL   rJ   T)biaslanguage_model)r4  r   rJ   )"rS   rT   model_configr   rM   rL   multimodal_configrK   r  r"   r  r6  _vision_tokens_per_image_mark_tower_modelr!   vision_modelr[   	Parameterr;   zerosr   qformer_configrU   query_tokensr   qformerr\   text_configlanguage_projection_mark_language_modelr*   r+   r8  make_empty_intermediate_tensors)	rf   r4  rJ   rK   rM   rL   r:  r  rh   s	           r?   rT   z&Blip2ForConditionalGeneration.__init__  sK   )3"/"/'4F!2, (3(3   	 	% ##K99 	 	 /| L LD "v.0E0Q ! !D
 -%)) ***	  DL (*y%1".( ( (D$	 	 	 	 	 	 	 	 	 	 	 	 	 	 	& &&{33 	 	"<' ,#F,<==# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ? 	,,,s%   B"EEE$+FF"Fkwargsc                     |                     dd           }|                     dd           }||d S |(| j        j        j        x}}t	          d|||d          S |t          d|          S t          d          )Nr.   rB   )r2   r3   )r/   r4   resolve_bindings)r/   r4   z This line should be unreachable.)poprK   r  r  r-   rA   AssertionError)rf   rG  r.   rB   
expected_h
expected_ws         r?   _parse_and_validate_image_inputz=Blip2ForConditionalGeneration._parse_and_validate_image_inputF  s     zz.$77zz.$77L$84#&*k&?&JJJ(#!'1
!C!C    #,#!   
 ?@@@r>   r=  r.   c                      ||          }|S r   r=   )rf   r=  r.   image_featuress       r?   _image_pixels_to_featuresz7Blip2ForConditionalGeneration._image_pixels_to_features_  s    
 &l33r>   inputsc                 H    |d         }|                      | j        |          S )Nr4   )rQ  r=  )rf   rR  r.   s      r?   _process_image_pixelsz3Blip2ForConditionalGeneration._process_image_pixelsh  s$    f~--d.?NNNr>   image_inputc                     |d         dk    r|d         S |                      |          }| j                            |j        d         dd          }|                     ||          }|                     |          S )Nr/   rB   r4   r   rj   )r   rr   )rT  rA  expandr   rB  rD  )rf   rU  rP  rA  query_outputs        r?   _process_image_inputz2Blip2ForConditionalGeneration._process_image_inputm  s    v.00v&&33K@@(//0DQ0GRPP||%"0 $ 
 

 ''555r>   c                 R     | j         di |}|g S |                     |          }|S )Nr=   )rN  rY  )rf   rG  rU  vision_embeddingss       r?   embed_multimodalz.Blip2ForConditionalGeneration.embed_multimodal{  s?    :d:DDVDDI 55kBB  r>   r  	positionsintermediate_tensorsinputs_embedsc                 J    |d}| j                             ||||          }|S )af  Run forward pass for BLIP-2.

        One key thing to understand is the `input_ids` already accounts for the
        positions of the to-be-inserted image embeddings.

        Concretely, consider a text prompt:
        `"Question: What's the content of the image? Answer:"`.

        Tokenizer outputs:
        `[2, 45641, 35, 653, 18, 5, 1383, 9, 5, 2274, 116, 31652, 35]`.

        To reserve space in KV cache, we have to insert placeholder tokens
        before they are inputted to the model, so the input processor prepends
        dummy tokens (denoted as `50265`), resulting in:
        `[50265, ..., 50265, 2, 45641, 35, ..., 31652, 35]`.

        We insert 32 tokens since it corresponds to the number of query
        embeddings outputted by the Q-Former and inputted to the language model.

        This way, the `positions` and `attn_metadata` are consistent
        with the `input_ids`.

        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.

        Info:
            [`Blip2ImageInputs`][vllm.model_executor.models.blip2.Blip2ImageInputs]
        N)r_  )r8  model)rf   r  r]  r^  r_  rG  rq   s          r?   r   z%Blip2ForConditionalGeneration.forward  s?    L  + M+11y"6m 2 
 
 r>   rq   c                 6    | j                             |          S r   )r8  compute_logitsr   s     r?   rc  z,Blip2ForConditionalGeneration.compute_logits  s     "11-@@@r>   weightsc                 J    t          |           }|                    |          S r   )r)   load_weights)rf   rd  loaders      r?   rf  z*Blip2ForConditionalGeneration.load_weights  s#    "4((""7+++r>   c                 4    t          j        dddgd          S )Nr8  rB  rD  r=  )r8  	connectortower_model)r(   from_string_fieldr   s    r?   get_mm_mappingz,Blip2ForConditionalGeneration.get_mm_mapping  s+    /+ "78&
 
 
 	
r>   r)  c                 ~    |dk    rdS || j         j        z  dk    s
J d            || j         j        z  }|| j        z  S )Nr   zLThe number of image tokens must be a multiple of the number of query tokens.)rK   r   r;  )rf   r)  r   s      r?   get_num_mm_encoder_tokensz7Blip2ForConditionalGeneration.get_num_mm_encoder_tokens  s\     q  1$+">>!CCC* DCC &(DD
D999r>   num_vision_tokensc                 t    |dk    rdS || j         z  dk    s
J d            || j         z  }|| j        j        z  S )Nr   zQThe number of vision tokens must be a multiple of the number of tokens per image.)r;  rK   r   )rf   ro  r   s      r?   get_num_mm_connector_tokensz9Blip2ForConditionalGeneration.get_num_mm_connector_tokens  sZ     !!1 4#@@AEEE. FEE ')FF
DK888r>   )NN)!r6   r7   r8   classmethodr   r   r3  r   rT   r,  rD   rN  r!   r;   r<   rQ  r-   rT  rY  r#   r\  r   r   rc  r   r   setrf  r(   rl  rn  rq  r   r   s   @r?   r/  r/    s        =3 =3 =3: = = = [= BD -
 -
 -
z -
3 -
 -
 -
 -
 -
 -
^AA	D	 A A A A2+;@<	   O,A Oel O O O O
60@ 6U\ 6 6 6 6! !4H ! ! ! ! <@-1- -<- <- 2D8	-
 |d*- - 
- - - -^A|A 
	A A A A,HU33D-E$F ,3s8 , , , ,
 
 
 
 
:: 
: : : :99 
9 9 9 9 9 9 9 9r>   r/  )Ncollections.abcr   r   r   typingr   r   r   r;   torch.nnr[   transformersr	   r
   r   r   vllm.configr   r   vllm.config.multimodalr   %vllm.model_executor.layers.activationr   'vllm.model_executor.layers.quantizationr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   vllm.multimodal.processingr   r   r   r   r   r   vllm.sequencer   vllm.utils.tensor_schemar   r   blipr!   r"   
interfacesr#   r$   r%   r&   r'   module_mappingr(   utilsr)   r*   r+   r-   rA   rD   r:   ModulerF   r   r   r   r   r   r   r   r   r   r  register_processorr/  r=   r>   r?   <module>r     s@   8 7 7 7 7 7 7 7 7 7 7 0 0 0 0 0 0 0 0 0 0                   0 / / / / / / / 3 3 3 3 3 3 < < < < < < F F F F F F / / / / / /         
 6 5 5 5 5 5                . - - - - - > > > > > > > > 7 7 7 7 7 7 7 7              + * * * * * N N N N N N N N N N
B 
B 
B 
B 
BL 
B 
B 
B	? 	? 	? 	? 	? 	? 	? 	? 46OO ) O O OO O O O ORY O O Od    RY   &!  !  !  !  ! BI !  !  ! H
 
 
 
 
ry 
 
 
       &Z Z Z Z Z	 Z Z Zz) ) ) ) )") ) ) )Z' ' ' ' '	 ' ' 'T	* 	* 	* 	* 	*, 	* 	* 	*
 
 
 
 
45HI 
 
 
82
 2
 2
 2
 2
67JK 2
 2
 2
j ('	(  
M9 M9 M9 M9 M9I|/]M9 M9 
M9 M9 M9r>   