
    .`iFY                        d dl mZmZ d dlmZ d dlmZ d dlZd dlm	Z	 d dl
mZmZ d dlmZ g dZ ee          Z G d	 d
e	          Z G d de          Z G d de          Z G d dee          Z G d dee          Z G d de          Z G d de          Z G d de          Z G d de          Z G d dee          Z G d de          Zded ed!ed"efd#Zd$ Zd% Z d& Z!dS )'    )CallableHashable)Fraction)WeakValueDictionaryN)	Parameter)get_tensor_model_parallel_rank$get_tensor_model_parallel_world_size)init_logger)BasevLLMParameterPackedvLLMParameterPerTensorScaleParameterModelWeightParameterChannelQuantScaleParameterGroupQuantScaleParameterPackedColumnParameterRowvLLMParameterc                   j    e Zd ZdZdej        dz  f fdZdej        defdZe	defd            Z
e
j        d	efd
            Z
e
j        d             Z
dej        fdZdej        fdZdej        fdZdej        fdZdej        fdZdej        fdZdeez  defdZed fd	            Z xZS )r   z
    Base parameter for vLLM linear layers. Extends the torch.nn.parameter
    by taking in a linear weight loader. Will copy the loaded weight
    into the parameter when the provided weight loader is called.
    dataNc                 L    t                                          | |d          S NF)r   requires_gradsuper__new__)clsr   kwargs	__class__s      q/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/parameter.pyr   zBasevLLMParameter.__new__&   s    wwsUCCC    weight_loaderc                     ddl m} |                                r|                    |          }|| _        t                      | _        t                      | _        dS )z
        Initialize the BasevLLMParameter

        :param data: torch tensor with the parameter data
        :param weight_loader: weight loader callable

        :returns: a torch.nn.parameter
        r   )current_platformN)	vllm.platformsr"   use_sync_weight_loadermake_synced_weight_loader_weight_loaderr   tp_rankr	   tp_size)selfr   r    r"   s       r   __init__zBasevLLMParameter.__init__)   sd    $ 	4333332244 	V,FF}UUM+577;==r   returnc                 V    | j         t          | j        j         d          | j         S )Nz) weight_loader attribute has been deleted)r&   AttributeErrorr   __name__r)   s    r   r    zBasevLLMParameter.weight_loaderD   s;     & >*UUU   ""r   valuec                     || _         d S Nr&   )r)   r0   s     r   r    zBasevLLMParameter.weight_loaderP   s    #r   c                     d | _         d S r2   r3   r/   s    r   r    zBasevLLMParameter.weight_loaderT   s    "r   loaded_weightc                     | j         j        dk    o| j                                         dk    }|j        dk    o|                                dk    }|o|S N   r   )r   ndimnumel)r)   r5   cond1cond2s       r   _is_1d_and_scalarz#BasevLLMParameter._is_1d_and_scalarX   sS    	!#>	(9(9Q(>"a'FM,?,?,A,AQ,Fr   c                     | j         j        |j        k    s|                     |          sJ | j                             |           d S r2   )r   shaper=   copy_r)   r5   s     r   _assert_and_loadz"BasevLLMParameter._assert_and_load]   sP    y-"5559O9O:
 :
55 
 		&&&&&r   c                 0    |                      |           d S r2   rB   rA   s     r   load_column_parallel_weightz-BasevLLMParameter.load_column_parallel_weightc       m,,,,,r   c                 0    |                      |           d S r2   rD   rA   s     r   load_row_parallel_weightz*BasevLLMParameter.load_row_parallel_weightf   rF   r   c                 0    |                      |           d S r2   rD   r)   r5   r   s      r   load_merged_column_weightz+BasevLLMParameter.load_merged_column_weighti   rF   r   c                 0    |                      |           d S r2   rD   rJ   s      r   load_qkv_weightz!BasevLLMParameter.load_qkv_weightl   rF   r   shard_idc                     t          |t                    r|S dddd}t          |t                    sJ ||v sJ ||         S )Nr   r8      )qkv)
isinstanceintstr)r)   rN   qkv_idxss      r   _shard_id_as_intz"BasevLLMParameter._shard_id_as_into   s\    h$$ 	O ++(C(((((8####!!r    c                 T    |i }t                                          ||||          S r2   )r   __torch_function__)r   functypesargsr   r   s        r   r[   z$BasevLLMParameter.__torch_function__z   s+    >Fww))$tVDDDr   )rY   N)r.   
__module____qualname____doc__torchTensorr   r   r*   propertyr    setterdeleterr=   rB   rE   rH   rK   rM   rV   rU   rX   classmethodr[   __classcell__r   s   @r   r   r      s        D5<$. D D D D D D>U\ >( > > > >6 	#x 	# 	# 	# X	# $8 $ $ $ $ # # #u|    
'el ' ' ' '- - - - --el - - - --u| - - - --U\ - - - -	"s 	"s 	" 	" 	" 	" E E E E E [E E E E Er   r   c                        e Zd ZdZdef fdZed             Zdej	        fdZ
dej	        fdZdej	        fdZ xZS )	_ColumnvLLMParametera  
    Private class defining weight loading functionality
    (load_merged_column_weight, load_qkv_weight)
    for parameters being loaded into linear layers with column
    parallelism. This includes QKV and MLP layers which are
    not already fused on disk. Requires an output dimension
    to be defined. Called within the weight loader of
    each of the column parallel linear layers.
    
output_dimc                 H    || _          t                      j        di | d S NrY   )_output_dimr   r*   )r)   rl   r   r   s      r   r*   z_ColumnvLLMParameter.__init__   s.    %""6"""""r   c                     | j         S r2   )ro   r/   s    r   rl   z_ColumnvLLMParameter.output_dim       r   r5   c                     | j         j        | j                 }|                    | j        | j        |z  |          }| j         j        |j        k    sJ | j                             |           d S r2   )r   r?   rl   narrowr'   r@   r)   r5   
shard_sizes      r   rE   z0_ColumnvLLMParameter.load_column_parallel_weight   sk    Y_T_5
%,,OT\J6

 
 y-"55555	&&&&&r   c                    |                     d          }|                     d          }t          | t          t          f          r*| j        | j        k    r|                     ||          \  }}| j        }|                    | j        ||          }|                    | j        | j	        |z  |          }|j
        |j
        k    sJ |                    |           d S )Nshard_offsetru   rw   ru   )getrT   r   r   
packed_dimrl    adjust_shard_indexes_for_packingr   rs   r'   r?   r@   )r)   r5   r   rw   ru   
param_datas         r   rK   z._ColumnvLLMParameter.load_merged_column_weight   s    zz.11ZZ--
 t35HIJJ	4?22'+'L'L)j (M ( ($J Y
&&tjQQ
%,,OT\J6

 
 =#66666'''''r   c                 :   |                     d          }|                     d          }|                     d          }|                     d          }t          | t          t          f          r*| j        | j        k    r|                     ||          \  }}| j        }|dk    r| j        n	| j        |z  }|	                    | j        ||          }|	                    | j        ||z  |          }|j
        |j
        k    sJ |                    |           d S )Nrw   ru   rN   	num_headsrx   rQ   )ry   rT   r   r   rl   rz   r{   r   r'   rs   r?   r@   )r)   r5   r   rw   ru   rN   r~   r|   s           r   rM   z$_ColumnvLLMParameter.load_qkv_weight   s'   zz.11ZZ--
::j))JJ{++	 t35HIJJ	4?22'+'L'L)j (M ( ($J Y
#+s??4<<	8Q&&tjQQ
%,,OX
2J
 
 =#66666'''''r   )r.   r_   r`   ra   rU   r*   rd   rl   rb   rc   rE   rK   rM   rh   ri   s   @r   rk   rk      s         #3 # # # # # #     X ' ' ' ' '(u| ( ( ( (,(U\ ( ( ( ( ( ( ( (r   rk   c                   T     e Zd ZdZdef fdZed             Zdej	        fdZ
 xZS )r   z
    Parameter class defining weight_loading functionality
    (load_row_parallel_weight) for parameters being loaded
    into linear layers with row parallel functionality.
    Requires an input_dim to be defined.
    	input_dimc                 H    || _          t                      j        di | d S rn   )
_input_dimr   r*   )r)   r   r   r   s      r   r*   zRowvLLMParameter.__init__   s-    #""6"""""r   c                     | j         S r2   )r   r/   s    r   r   zRowvLLMParameter.input_dim   s
    r   r5   c                 8   | j         j        | j                 }|                    | j        | j        |z  |          }t          |j                  dk    r|                    d          }| j         j        |j        k    sJ | j                             |           d S )Nr   r8   )r   r?   r   rs   r'   lenreshaper@   rt   s      r   rH   z)RowvLLMParameter.load_row_parallel_weight   s    Y_T^4
%,,NDL:5z
 
 }"##q(()11!44My-"55555	&&&&&r   )r.   r_   r`   ra   rU   r*   rd   r   rb   rc   rH   rh   ri   s   @r   r   r      s         ## # # # # # #   X
'el 
' 
' 
' 
' 
' 
' 
' 
'r   r   c                       e Zd ZdZdS )r   z]
    Parameter class for linear layer weights. Uses both column and
    row parallelism.
    Nr.   r_   r`   ra   rY   r   r   r   r               
 	Dr   r   c                       e Zd ZdZdS )r   z
    Parameter class for weight scales loaded for weights with
    grouped quantization. Uses both column and row parallelism.
    Nr   rY   r   r   r   r      r   r   r   c                       e Zd ZdZdS )r   z
    Parameter class for weight scales loaded for weights with
    channel-wise quantization. Equivalent to _ColumnvLLMParameter.
    Nr   rY   r   r   r   r      r   r   r   c                   b     e Zd ZdZ fdZ fdZd Zd Z fdZde	j
        deez  fd	Z xZS )
r   a  
    Parameter class for scales where the number of scales is
    equivalent to the number of logical matrices in fused linear
    layers (e.g. for QKV, there are 3 scales loaded from disk).
    This is relevant to weights with per-tensor quantization.
    Adds functionality to map the scalers to a shard during
    weight loading.

    Note: additional parameter manipulation may be handled
    for each quantization config specifically, within
    process_weights_after_loading
    c                 :     t                      j        di | d S rn   )r   r*   )r)   r   r   s     r   r*   z PerTensorScaleParameter.__init__  s&    ""6"""""r   c                 :     t                      j        |i | d S r2   r   rH   r)   r^   r   r   s      r   rH   z0PerTensorScaleParameter.load_row_parallel_weight  %    (($9&99999r   c                       | j         |i | d S r2   _load_into_shard_idr)   r^   r   s      r   rK   z1PerTensorScaleParameter.load_merged_column_weight         $1&11111r   c                       | j         |i | d S r2   r   r   s      r   rM   z'PerTensorScaleParameter.load_qkv_weight  r   r   c                 :     t                      j        |i | d S r2   r   r   s      r   rE   z3PerTensorScaleParameter.load_column_parallel_weight   r   r   r5   rN   c                    | j         }|                     |          }t          |j                  dk    r|j        d         dk    sJ |d         }||         }|j        |j        k    sJ |                    |           dS )zU
        Slice the parameter data based on the shard id for
        loading.
        r   r8   N)r   rX   r   r?   r@   )r)   r5   rN   r   r|   s        r   r   z+PerTensorScaleParameter._load_into_shard_id#  s     Y
((22 }"##q(( &q)Q....)!,M)
=#66666'''''r   )r.   r_   r`   ra   r*   rH   rK   rM   rE   rb   rc   rV   rU   r   rh   ri   s   @r   r   r     s         # # # # #
: : : : :2 2 22 2 2: : : : :("\(583Y( ( ( ( ( ( ( (r   r   c            
            e Zd ZdZ	 	 ddeez  dededz  dedz  f fdZed             Zed	             Z	ed
             Z
ed             Zd Z xZS )r   z
    Parameter for model parameters which are packed on disk
    and support column parallelism only. See PackedvLLMParameter
    for more details on the packed properties.
    Npacked_factorrz   marlin_tile_sizebitblas_tile_sizec                 r    || _         || _        || _        || _         t	                      j        di | d S rn   _packed_factor_packed_dim_marlin_tile_size_bitblas_tile_sizer   r*   r)   r   rz   r   r   r   r   s         r   r*   zPackedColumnParameter.__init__@  H     ,%!1"3""6"""""r   c                     | j         S r2   r   r/   s    r   rz   z PackedColumnParameter.packed_dimN  rq   r   c                     | j         S r2   r   r/   s    r   r   z#PackedColumnParameter.packed_factorR      ""r   c                     | j         S r2   r   r/   s    r   r   z&PackedColumnParameter.marlin_tile_sizeV      %%r   c                     | j         S r2   r   r/   s    r   r   z'PackedColumnParameter.bitblas_tile_sizeZ      &&r   c                 H    t          ||| j        | j        | j                  S Nru   rw   r   r   r   !_adjust_shard_indexes_for_packingr   r   r   r)   ru   rw   s      r   r{   z6PackedColumnParameter.adjust_shard_indexes_for_packing^  1    0!%,!2"4
 
 
 	
r   NNr.   r_   r`   ra   rU   r   r*   rd   rz   r   r   r   r{   rh   ri   s   @r   r   r   9  s         (,(,# #X~# # *	#
 :# # # # # #     X  # # X# & & X& ' ' X'
 
 
 
 
 
 
r   r   c            
            e Zd ZdZ	 	 ddeez  dededz  dedz  f fdZed             Zed	             Z	ed
             Z
ed             Zd Z xZS )r   a  
    Parameter for model weights which are packed on disk.
    Example: GPTQ Marlin weights are int4 or int8, packed into int32.
    Extends the ModelWeightParameter to take in the
    packed factor, the packed dimension, and optionally, marlin
    tile size for marlin kernels. Adjusts the shard_size and
    shard_offset for fused linear layers model weight loading
    by accounting for packing and optionally, marlin tile size.
    Nr   rz   r   r   c                 r    || _         || _        || _        || _         t	                      j        di | d S rn   r   r   s         r   r*   zPackedvLLMParameter.__init__s  r   r   c                     | j         S r2   r   r/   s    r   rz   zPackedvLLMParameter.packed_dim  rq   r   c                     | j         S r2   r   r/   s    r   r   z!PackedvLLMParameter.packed_factor  r   r   c                     | j         S r2   r   r/   s    r   r   z$PackedvLLMParameter.marlin_tile_size  r   r   c                     | j         S r2   r   r/   s    r   r   z%PackedvLLMParameter.bitblas_tile_size  r   r   c                 H    t          ||| j        | j        | j                  S r   r   r   s      r   r{   z4PackedvLLMParameter.adjust_shard_indexes_for_packing  r   r   r   r   ri   s   @r   r   r   h  s         (,(,# #X~# # *	#
 :# # # # # #     X  # # X# & & X& ' ' X'
 
 
 
 
 
 
r   r   c                       e Zd ZdZdS )BlockQuantScaleParameterz
    Parameter class for weight scales loaded for weights with
    block-wise quantization. Uses both column and row parallelism.
    Nr   rY   r   r   r   r     r   r   r   c                   F    e Zd ZU dZ e            Zeed<   eej	                 ed<   e
eeez  f         ed<    fdZdded	ef fd
ZdedefdZdej	        fdZdej	        fdZdej	        fdZdej	        fdZd Zed             Zdedej	        deez  dz  fdZ xZS )SharedWeightParametera*  
    Parameter for weights with many shared tensors across a model

    For example, when applying transforms to the "gate" and "up" partitions of
    `MergedColumnParallelLinear`, the transform weights must stay separate
    tensors in order to allow for tensor memory sharing between layers.
    tensors_registrylocal_tensors
partitionsc                 <     t                      j        | fdd i|S )Nr   r   )r   r   r   s     r   r   zSharedWeightParameter.__new__  s&    uwws888888r   r8   r   r   rl   c                    |                     d          }t                                          d |           t                      | _        i | _        ||| j        d| _        | j        dk    rt          | j
        j         d          d S )Nr    )r   r    )r   rl   r    r8   z. does not currently support tensor parallelism)ry   r   r*   setr   r   _fake_weight_loaderr   r(   NotImplementedErrorr   r.   )r)   r   rl   r   r    r   s        r   r*   zSharedWeightParameter.__init__  s    "(**_"="=d-@@@ UU"$!5
 
 <!%>* 7 7 7   r   indexdata_keyc                     || j         vrt          j        |i |}|| j         |<   n| j         |         }t          dd|i| j        | j        |<   | j                            |           dS )ab  
        Add a partition to the weight parameter. Partitions whose `data_key`
        is the same will share tensor data

        :param index: index of partition to add
        :param data_key: hashable key used to key shared tensors
        :param *args: arguments for `torch.empty`
        :param **kwargs: keyword arguments for `torch.empty`
        r   NrY   )r   rb   emptyr   r   r   r   add)r)   r   r   r^   r   r   s         r   add_partitionz#SharedWeightParameter.add_partition  s     4000;///D.2D!(++(2D "6!O!O4!O4;!O!O 	t$$$$$r   r5   c                     t          | j                  dk    r	d| j        v sJ | j        d         }t                              ||           d S r7   )r   r   r   rE   r)   r5   	partitions      r   rE   z1SharedWeightParameter.load_column_parallel_weight  sQ    4?##q((Q$/-A-A-AAOA&	88MRRRRRr   c                     t          | j                  dk    r	d| j        v sJ | j        d         }t                              ||           d S r7   )r   r   r   rH   r   s      r   rH   z.SharedWeightParameter.load_row_parallel_weight  sQ    4?##q((Q$/-A-A-AAOA&	55iOOOOOr   c                 <   |                     d          }|                     |          }| j        |         }| j                            d          }|j                            |          | j        z  }| j        |z  }t          
                    ||||           d S )NrN   r   rx   )poprX   r   r   ry   r   sizer(   r'   r   rK   )r)   r5   r   partition_idr   r   ru   rw   s           r   rK   z/SharedWeightParameter.load_merged_column_weight  s    zz*--,,\::OL1	KOOK00	^((33t|C
|j066}<J 	7 	
 	
 	
 	
 	
r   c                 j   |                      |                    d                    }| j        |         }| j                            d          }|j                            |          | j        z  }| j        |z  }d}|                    d          }	t          
                    ||||||	           d S )NrN   r   rQ   r~   )rw   ru   rN   r~   )rX   r   r   r   ry   r   r   r(   r'   r   rM   )
r)   r5   r   r   r   r   ru   rw   rN   r~   s
             r   rM   z%SharedWeightParameter.load_qkv_weight  s    ,,VZZ
-C-CDDOL1	KOOK00	^((33t|C
|j0JJ{++	,,%! 	- 	
 	
 	
 	
 	
r   c                     | j         D ];}t          j                            | j         |         j        d          | j         |<   <d S r   )r   rb   nnr   r   )r)   keys     r   process_weights_after_loadingz3SharedWeightParameter.process_weights_after_loading  sR    ? 	 	C#(8#5#5_S).e $6 $ $DOC  	 	r   c                      t          d          )NzAccessing `data` of a `PartitionedModelWeightParameter` is not allowed. Instead, use `get_partition` to get the weight of the particular partition you want to access)
ValueErrorr/   s    r   r   zSharedWeightParameter.data  s    :
 
 	
r   paramloaded_weight_shard_idNc                 V    t          d| j        j         d| j        j         d          )Nz"When loading partition weights of z, use methods provided by z, not partition loader)r   r   r.   )r)   r   r5   r   s       r   r   z)SharedWeightParameter._fake_weight_loader!  sD     ?~&? ?~&? ? ?
 
 	
r   )r8   r   )r.   r_   r`   ra   r   r   __annotations__r   rb   rc   dictrU   r   r   r   r*   r   r   rE   rH   rK   rM   r   rd   r   r   rV   r   rh   ri   s   @r   r   r     s          -@,?,A,A)AAA
 u|$$$$ S.::;;;;9 9 9 9 9 # s      $%3 %( % % % %0S S S S SPel P P P P
u| 
 
 
 

U\ 
 
 
 
&   
 
 X


 

 |

 !$c	D 0	

 

 

 

 

 

 

 

r   r   r   r   rl   r+   c                    t          | dd          t          | dd          '| j                                        dk    s
J d            
J d            dz   dz  
J d            dz   dz  fdt          | j                                                  D             }|                    |           |                    |           d	|v r1t          | d	          r| j        ||d	                  k    s
J d
             | j        j        | | _        t          | d          r|| _        t          | d          r|| _	        d	|v rt          | d          r|d	         | _
        | S )a
  
    Permute a parameter's layout to the specified input and output dimensions,
    useful for forcing the parameter into a known layout, for example, if I need
    a packed (quantized) weight matrix to be in the layout
        {input_dim = 0, output_dim = 1, packed_dim = 0}
    then I can call:
        permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
    to ensure x is in the correct layout (permuting it to the correct layout if
    required, asserting if it cannot get it to the correct layout)
    r   Nrl   rP   z`permute_param_layout_ only supports 2D parameters when either input_dim or output_dim is not setz&either input or output dim must be setr8   c                      g | ]
}|fv|S rY   rY   ).0icurr_input_dimcurr_output_dims     r   
<listcomp>z)permute_param_layout_.<locals>.<listcomp>Q  s/       a7X.X.X.X.X.Xr   rz   z9permute_param_layout_ currently doesn't support repackingr   ro   r   )getattrr   dimrangeinserthasattrrz   permuter   ro   r   )r   r   rl   r   permr   r   s        @@r   permute_param_layout_r   .  s    UK66Ne\488O!8z~~1$$$1 %$$ **,T***)A-2))+S))))A-2
    ))**  D 	KK	>***KK
O,,,vE<((	G D)=$>>>>F ?>? $#T*EJul## %$um$$ '&v'%"?"?"<0Lr   c                     | |z  ||z  fS r2   rY   ru   rw   r   s      r    _adjust_shard_indexes_for_marlinr   h  s    ((,9I*IIIr   c                     | |z  ||z  fS r2   rY   ru   rw   r   s      r   !_adjust_shard_indexes_for_bitblasr   l  s    **L<M,MMMr   c                 n    | |z  } ||z  }|t          | ||          S |t          | ||          S | |fS )Nr   r   )r   r   r   s        r   r   r   p  ss     },J=0L#/!%-
 
 
 	

 
	&0!%/
 
 
 	
 |##r   )"collections.abcr   r   	fractionsr   weakrefr   rb   torch.nnr   vllm.distributedr   r	   vllm.loggerr
   __all__r.   loggerr   rk   r   r   r   r   r   r   r   r   r   rU   r   r   r   r   rY   r   r   <module>r     s5   / . . . . . . .       ' ' ' ' ' '               $ # # # # #	 	 	 
X		_E _E _E _E _E	 _E _E _EDH( H( H( H( H(, H( H( H(V' ' ' ' '( ' ' ':	 	 	 	 	/1A 	 	 		 	 	 	 	35E 	 	 		 	 	 	 	!5 	 	 	2( 2( 2( 2( 2(/ 2( 2( 2(j,
 ,
 ,
 ,
 ,
0 ,
 ,
 ,
^0
 0
 0
 0
 0
. 0
 0
 0
f	 	 	 	 	35E 	 	 	G
 G
 G
 G
 G
- G
 G
 G
T77),7:=77 7 7 7tJ J JN N N$ $ $ $ $r   