
    .`i.                        d Z ddlmZ ddlZddlmZ ddlmZmZ ddl	m
Z
mZ ddlmZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZ dededefdZdededefdZ G d dej                  Z  G d dej                  Z! G d dej                  Z" G d dej                  Z# G d dej                  Z$ G d dej        e          Z%dS )zbMinimal implementation of BlipVisionModel intended to be only used
within a vision language model.    )IterableN)Blip2VisionConfigBlipVisionConfig)divide$get_tensor_model_parallel_world_size)
get_act_fn)MMEncoderAttention)Conv2dLayer)ColumnParallelLinearQKVParallelLinearRowParallelLinear)QuantizationConfig)default_weight_loader   )SupportsQuant
image_size
patch_sizereturnc                 "    | |z  dk    sJ | |z  S )Nr    r   r   s     s/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/blip.pyget_blip_patch_grid_lengthr      s#    
"a''''##    c                 .    t          | |          }||z  S )Nr   )r   )r   r   grid_lengths      r   get_blip_num_patchesr       s(    ,*  K $$r   c                   N     e Zd Zdeez  f fdZdej        dej        fdZ xZ	S )BlipVisionEmbeddingsconfigc                    t                                                       || _        |j        | _        |j        | _        |j        | _        t          j        t          j
        dd| j                            | _        t          d| j        | j        | j                  | _        t          | j        | j                  | _        | j        dz   | _        t          j        t          j
        d| j        | j                            | _        d S )Nr      )in_channelsout_channelskernel_sizestrider   )super__init__r    hidden_size	embed_dimr   r   nn	Parametertorchrandnclass_embeddingr
   patch_embeddingr   num_patchesnum_positionsposition_embedding)selfr    	__class__s     r   r(   zBlipVisionEmbeddings.__init__)   s    + + +!|EK1dn,M,MNN*?	 
  
  
 04?
 
 
 "-1"$,K4-t~>>#
 #
r   pixel_valuesr   c                    |j         d         }| j        j        j        }|                     |                    |                    }|                    d                              dd          }| j                            |dd          }t          j
        ||gd          }| j                            |          }||d d d |                    d          d d f         z   }|S )Nr   )dtype   r   dim)shaper0   weightr8   toflatten	transposer/   expandr-   catr3   size)r4   r6   
batch_sizetarget_dtypepatch_embedsclass_embeds
embeddingsposition_embedss           r   forwardzBlipVisionEmbeddings.forwardC   s    !'*
+28++OO,O//
 
 $++A..88A>>+22:q"EEYl;CCC
144\BB/!!!5Izq7I7I5I1112L"MM
r   )
__name__
__module____qualname__r   r   r(   r-   TensorrK   __classcell__r5   s   @r   r   r   (   sq        
/2CC 
 
 
 
 
 
4EL U\        r   r   c            	       z     e Zd ZdZ	 	 ddeez  dedz  deddf fdZd	e	j
        d
edefdZde	j
        fdZ xZS )BlipAttentionz=Multi-headed attention from 'Attention Is All You Need' paperN r    quant_configprefixr   c           	         t                                                       || _        |j        | _        |j        | _        | j        | j        z  | _        | j        | j        z  | j        k    r t          d| j         d| j         d          | j        dz  | _	        |j
        | _        t          | j        | j        | j        |j        || d          | _        t          | j        | j        || d          | _        t#                      | _        t'          | j        | j                  | _        t+          | j        | j        | j	                  | _        d S )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      z.qkvbiasrU   rV   z.projectionrU   rV   )r'   r(   r    r)   r*   num_attention_heads	num_headshead_dim
ValueErrorscaleattention_dropoutdropoutr   qkv_biasqkvr   
projectionr   tp_sizer   num_heads_per_partitionr	   attnr4   r    rU   rV   r5   s       r   r(   zBlipAttention.__init__W   s_    	+3$.8=4>)T^;;'%)^' 'N' ' '  
 ]D(
/$NMN%???
 
 
 ,NN%)))	
 
 
 <=='-dndl'K'K$&($-
 
			r   tensorseq_lenbszc                     |                     ||| j        | j                                      dd                                          S )Nr   r9   )viewr\   r]   rA   
contiguous)r4   ri   rj   rk   s       r   _shapezBlipAttention._shape   s6    KKWdndmDDYq!__Z\\	
r   hidden_statesc                     |                      |          \  }}|                    dd          \  }}}|                     |||          }|                     |          \  }}|dfS )z#Input shape: Batch x Time x Channelr"   r:   r;   N)rc   chunkrg   rd   )	r4   rp   
qkv_states_query_states
key_statesvalue_statesoutattn_outputs	            r   rK   zBlipAttention.forward   sm     //
A1;1A1A!1A1L1L.j,iij,??--QD  r   NrT   )rL   rM   rN   __doc__r   r   r   strr(   r-   rO   intro   rK   rP   rQ   s   @r   rS   rS   T   s        GG
 37	(
 (
 #44(
 )4/(
 	(

 
(
 (
 (
 (
 (
 (
T
U\ 
C 
c 
 
 
 
!|! ! ! ! ! ! ! !r   rS   c            	       `     e Zd Z	 	 d
dededz  deddf fdZdej        dej        fd	Z	 xZ
S )BlipMLPNrT   r    rU   rV   r   c                     t                                                       || _        t          |j                  | _        t          |j        |j        d|| d          | _	        t          |j        |j        d|| d          | _        d S )NTz.fc1rX   z.fc2)r'   r(   r    r   
hidden_actactivation_fnr   r)   intermediate_sizefc1r   fc2rh   s       r   r(   zBlipMLP.__init__   s     	'(9::'$%???
 
 
 %$%???
 
 
r   rp   c                     |                      |          \  }}|                     |          }|                     |          \  }}|S N)r   r   r   )r4   rp   rt   s      r   rK   zBlipMLP.forward   sG    88M22q**=9988M22qr   rz   rL   rM   rN   r   r   r|   r(   r-   rO   rK   rP   rQ   s   @r   r   r      s         37	
 
 
 )4/
 	

 

 
 
 
 
 
4U\ el        r   r   c            	       `     e Zd Z	 	 d
dededz  deddf fdZdej        dej        fd	Z	 xZ
S )BlipEncoderLayerNrT   r    rU   rV   r   c                 D   t                                                       t          ||| d          | _        t	          j        |j        |j                  | _        t          ||| d          | _
        t	          j        |j        |j                  | _        d S )Nz
.self_attnrZ   epsz.mlp)r'   r(   rS   	self_attnr+   	LayerNormr)   layer_norm_epslayer_norm1r   mlplayer_norm2rh   s       r   r(   zBlipEncoderLayer.__init__   s     	 '%(((
 
 

 <(:@UVVV6___UUU<(:@UVVVr   rp   c                     |}|                      |          }|                     |          \  }}||z   }|}|                     |          }|                     |          }||z   }|S )N)rp   )r   r   r   r   )r4   rp   residualrt   s       r   rK   zBlipEncoderLayer.forward   su     ((77>>>FFq =0 ((77// =0r   rz   r   rQ   s   @r   r   r      s         37	W W W )4/W 	W
 
W W W W W W$U\ el        r   r   c                   b     e Zd ZdZ	 	 	 ddededz  dedz  deddf
 fd	Zd
e	j
        fdZ xZS )BlipEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self
    attention layers. Each layer is a [`BlipEncoderLayer`].

    Args:
        config: BlipConfig
    NrT   r    rU   num_hidden_layers_overriderV   r   c                     t                                                       | _        |j        }n|}t	          j        fdt          |          D                       | _        d S )Nc           	      >    g | ]}t           d |           S )z.layers.)r    rU   rV   )r   ).0	layer_idxr    rV   rU   s     r   
<listcomp>z(BlipEncoder.__init__.<locals>.<listcomp>   sQ         !!!-$99i99    r   )r'   r(   r    num_hidden_layersr+   
ModuleListrangelayers)r4   r    rU   r   rV   r   r5   s    `` ` r   r(   zBlipEncoder.__init__   s     	%- & 8 :m      "''8!9!9  	
 	
r   inputs_embedsc                 4    |}| j         D ]} ||          }|S r   )r   )r4   r   rp   encoder_layers       r   rK   zBlipEncoder.forward   s/    %![ 	9 	9M)M-88MMr   )NNrT   )rL   rM   rN   r{   r   r   r}   r|   r(   r-   rO   rK   rP   rQ   s   @r   r   r      s          3715
 
 
 )4/
 %($J	

 
 

 
 
 
 
 
6U\        r   r   c                        e Zd ZeZdZdg diZ	 ddddddededz  d	edz  d
e	dz  de
ddf fdZdej        dej        fdZdeee
ej        f                  dee
         fdZ xZS )BlipVisionModelr6   qkv_proj)q_projk_projv_projNrT   )r   require_post_normrV   r    rU   r   r   rV   r   c                   t                                                       || _        t          |          | _        t          |||| d          | _        |j        }t          | j        j	                  |j        k    r-t          d| dt          | j        j	                   d          |t          | j        j	                  |k    }|r't          j        |j        |j                  | _        d S d | _        d S )Nz.encoder)r    rU   r   rV   zThe original encoder only has z layers, but you requested z layers.r   )r'   r(   r    r   rI   r   encoderr   lenr   r^   r+   r   r)   r   post_layernorm)r4   r    rU   r   r   rV   r   r5   s          r   r(   zBlipVisionModel.__init__  s.    	.v66"%'A&&&	
 
 
 #4t|"##f&>>>P1B P P-01D-E-EP P P   $ #DL$7 8 8<M M 	'"$,"(=# # #D #'Dr   c                     |                      |          }|                     |          }| j        |S |                     |          S )N)r   )rI   r   r   )r4   r6   rp   s      r   rK   zBlipVisionModel.forward2  sI    55=AA&  ""=111r   weightsc                 p   g d}t          |                                           }t                      }t          | j        j                  }|D ]\  }}|                    d          r| j        "|                    d          r/t          |	                    d          d                   }||k    rf|D ]>\  }	}
}|
|vr|
                    |
|	          }||         }|j        } ||||            n*||         }t          |dt                    } |||           |                    |           |S )N))r   r   q)r   r   k)r   r   vr   zencoder.layers.r9   weight_loader)dictnamed_parameterssetr   r   r   
startswithr   r}   splitreplacer   getattrr   add)r4   r   stacked_params_mappingparams_dictloaded_paramslayer_countnameloaded_weightr   
param_nameweight_nameshard_idparamr   s                 r   load_weightszBlipVisionModel.load_weights;  sl   "
 "
 "
 4002233"%%%$,-..#* 	$ 	$D-/00 T5H5P /00 

3 233	++5K 4 41
Kd**||K<<#D) % 3e]H===#D) '@U V Ve]333d####r   r   )rL   rM   rN   r   config_classmain_input_namepacked_modules_mappingr   r}   boolr|   r(   r-   rO   rK   r   tupler   r   rP   rQ   s   @r   r   r     s       #L$O(*H*H*HI
 37$'
 26)-$' $' $' $' )4/$'
 %($J$'  $;$' $' 
$' $' $' $' $' $'L2EL 2U\ 2 2 2 2#HU33D-E$F #3s8 # # # # # # # #r   r   )&r{   collections.abcr   r-   torch.nnr+   transformersr   r   vllm.distributedr   r   %vllm.model_executor.layers.activationr   9vllm.model_executor.layers.attention.mm_encoder_attentionr	   vllm.model_executor.layers.convr
   !vllm.model_executor.layers.linearr   r   r   'vllm.model_executor.layers.quantizationr   -vllm.model_executor.model_loader.weight_utilsr   
interfacesr   r}   r   r   Moduler   rS   r   r   r   r   r   r   r   <module>r      s  # # % $ $ $ $ $        < < < < < < < < I I I I I I I I < < < < < < X X X X X X 7 7 7 7 7 7         
 G F F F F F O O O O O O % % % % % %$c $s $s $ $ $ $
% % % % % % %) ) ) ) )29 ) ) )X?! ?! ?! ?! ?!BI ?! ?! ?!D         bi      F    ry   D) ) ) ) )") ) ) )XW W W W Wbi W W W W Wr   