
    .`i(              
          d dl Z d dlmZ d dlmZ d dlZd dlmZmZ d dl	m
Z
mZmZmZmZmZ d dlmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZmZmZmZmZmZm Z  d dl!m"Z" d dl#m$Z$  ee%          Z&g dZ'd Z(d Z)d Z*dede+e,e-e.e.f         f         de,de-e.e.f         fdZ/d Z0de+e,ef         fdZ1 G d de          Z2 G d de2          Z3 G d de          Z4 ej5        d           G d  d!e4                      Z6 ej5        d"           G d# d$e4                      Z7 G d% d&e7          Z8 G d' d(e7          Z9 ej5        d)           G d* d+e4                      Z:dS ),    N)abstractmethod)Any)	ParameterUninitializedParameter)divideget_tensor_model_parallel_rank$get_tensor_model_parallel_world_sizesplit_tensor_along_last_dim tensor_model_parallel_all_gather tensor_model_parallel_all_reduce)init_logger)CustomOp)QuantizationConfigQuantizeMethodBase)dispatch_unquantized_gemm)BasevLLMParameterBlockQuantScaleParameterModelWeightParameterPackedColumnParameterPackedvLLMParameterPerTensorScaleParameterRowvLLMParameter)set_weight_attrs)current_platform)UnquantizedLinearMethodCompressedTensorsLinearMethod&CompressedTensorsLinearTransformMethodBitBLASLinearMethodGPTQBitBLASLinearMethodAWQMarlinLinearMethodAWQLinearMethodGPTQMarlinLinearMethodFp8LinearMethodMarlinLinearMethodGPTQMarlin24LinearMethodTPUInt8LinearMethodGPTQLinearMethodFBGEMMFp8LinearMethodModelOptFp8LinearMethodModelOptFp8PcPtLinearMethodModelOptFp8PbWoLinearMethodIPEXAWQLinearMethodIPEXGPTQLinearMethodQuarkLinearMethodModelOptNvFp4LinearMethodPetitNvFp4LinearMethodc                 D    t          | dd           }|
||z  ||z  fS ||fS )Nbitblas_tile_sizegetattr)param
shard_sizeshard_offsetr2   s       u/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/layers/linear.pyadjust_bitblas_shardr9   B   s=    ':DAA$//AR1RSS|##    c                 D    t          | dd           }|||fS ||z  ||z  fS )Nmarlin_tile_sizer3   )r5   r6   r7   r<   s       r8   adjust_marlin_shardr=   J   s=    u&8$??<''((,9I*IIIr:   c                 N    | J | d         }||z   dz
  |z  }||z   dz
  |z  }||fS )Nr       )weight_block_sizer6   r7   block_ns       r8   adjust_block_scale_shardrC   R   sK    ((("G 7*Q.7:Lw&*w6J|##r:   r5   shard_offsetsloaded_shard_idreturnc                 z    |d         \  }}||         \  }}| j         j        d         }||z  |z  }||z  |z  }	|	|fS )zDAdjust the quantization offsets and sizes for BitsAndBytes sharding.totalr   )datashape)
r5   rD   rE   rH   _orig_offset	orig_sizequantized_totalquantized_offsetquantized_sizes
             r8   adjust_bitsandbytes_4bit_shardrQ   Z   s[    
 W%HE1*?;Kj&q)O"_4=0E9N+++r:   c                    dddd}t          |t                    r	||         }n't          |t                    st          d|           t	          |j                  dk    r|j        d         dk    sJ |d         }| |         |fS )a  For fused modules (QKV and MLP) we have an array of length
    N that holds 1 scale for each "logical" matrix. So the param
    is an array of length N. The loaded_weight corresponds to
    one of the shards on disk. Here, we slice the param based on
    the shard_id for loading.
    r   r?      qkvzUnknown Shard Id )
isinstancestrint
ValueErrorlenrJ   )r5   loaded_weightshard_idqkv_idxss       r8   adjust_scalar_to_fused_arrayr`   i   s     QQ''H(C   9H%#&& 97X77888 =1$$"1%****%a(?M))r:   bnb_weight_attrsc                      d         }|dd         }|dd         |d         z
  }d d         d         i} fdt          dt          |          dz
            D             }t          ||          }t          ||          }||fS )	a  
    Separate the BitsAndBytes 4-bit shard.

    For example, given bnb weight attributes as below:
    {
        'bnb_shard_offsets': array([0, 4, 8, 16]),
        'bnb_quant_state': {0: ..., 1: ..., 2: ...},
    }

    The function will return:
    {
        'bnb_shard_offsets': array([0, 4]),
        'bnb_quant_state': {0: ...},
    }
    and
    {
        'bnb_shard_offsets': array([0, 4, 12]),
        'bnb_quant_state': {0: ..., 1: ...},
    }
    bnb_shard_offsetsNrS   r?   r   bnb_quant_statec                 4    i | ]}|d z
  d         |         S )r?   rd   r@   ).0ira   s     r8   
<dictcomp>z6left_shift_bitsandbytes_4bit_shard.<locals>.<dictcomp>   s;        	
A 1215  r:   )rc   rd   )ranger\   dict)ra   rD   offset_loffset_rquant_state_lquant_state_rleftrights   `       r8   "left_shift_bitsandbytes_4bit_shardrq      s    * %%89MRaR HQRR =#33H():;A>?M   q#m,,q011  M (MJJJD8]KKKE;r:   c                       e Zd ZdZedej        j        dede	e         dededej
        fd            Ze	 ddej        j        d
ej        dej        d	z  dej        fd            Zd	S )LinearMethodBasez:Base class for different (maybe quantized) linear methods.layerinput_size_per_partitionoutput_partition_sizes
input_sizeoutput_sizeparams_dtypec                     t           )a  Create weights for a linear layer.
           The weights will be set as attributes of the layer.

        Args:
            layer: The layer that is using the LinearMethodBase factory.
            input_size_per_partition: Size of the weight input dim on rank X.
            output_partition_sizes: Sizes of the output dim of each logical
                weight on rank X. E.g., output_partition_sizes for QKVLinear
                is a list contains the width of Wq, Wk, Wv on rank X.
            input_size: Size of the input dim of the weight across all ranks.
            output_size: Size of the output dim of the weight across all ranks.
            params_dtype: Datatype of the parameters.
        NotImplementedError)selfrt   ru   rv   rw   rx   ry   extra_weight_attrss           r8   create_weightszLinearMethodBase.create_weights   s
    0 "!r:   NxbiasrF   c                     t           )zwApply the weights in layer to the input tensor.
        Expects create_weights to have been called before on the layer.r{   r}   rt   r   r   s       r8   applyzLinearMethodBase.apply   s
     "!r:   N)__name__
__module____qualname____doc__r   torchnnModulerZ   listdtyper   Tensorr   r@   r:   r8   rs   rs      s        DD"x" #&" !%S		"
 " " k" " " ^"2 
 %)	" "x" <" lT!	"
 
" " " ^" " "r:   rs   c                       e Zd ZdZdej        j        dedee         dededej	        fdZ
dej        j        d	d
fdZ	 ddej        j        dej        dej        d
z  d	ej        fdZd
S )r   z#Linear method without quantization.rt   ru   rv   rw   rx   ry   c                     |                     d          }t          t          j        t	          |          ||          dd|          }	|                    d|	           t          |	|           d S )Nweight_loaderr   r?   r   )rI   	input_dim
output_dimr   weight)popr   r   emptysumregister_parameterr   )
r}   rt   ru   rv   rw   rx   ry   r~   r   r   s
             r8   r   z&UnquantizedLinearMethod.create_weights   s     +..??%*++("  
 '	
 	
 	
 	  6222!344444r:   rF   Nc                 V    t          j                    rddlm}  ||d           d S d S )Nr   )dispatch_cpu_unquantized_gemmT)remove_weight)r   is_cpu vllm.model_executor.layers.utilsr   )r}   rt   r   s      r8   process_weights_after_loadingz5UnquantizedLinearMethod.process_weights_after_loading   sN    "$$ 	EVVVVVV))%tDDDDDD	E 	Er:   r   r   c                 @     t                      |||j        |          S r   )r   r   r   s       r8   r   zUnquantizedLinearMethod.apply   s#     +(**5!U\4HHHr:   r   )r   r   r   r   r   r   r   rZ   r   r   r   r   r   r   r@   r:   r8   r   r      s        --5x5 #&5 !%S		5
 5 5 k5 5 5 5:E58? Et E E E E %)	I IxI <I lT!	I
 
I I I I I Ir:   r   c                   r     e Zd ZdZ	 	 	 	 dddddeded	ed
ej        dz  dedz  de	dedef fdZ
d Z xZS )
LinearBasea  Base linear layer.

    Args:
        input_size: input dimension of the linear layer.
        output_size: output dimension of the linear layer.
        skip_bias_add: If true, skip adding bias but instead return it.
        params_dtype: Data type for the parameters.
        quant_config: Quantization configure.
        prefix: Prefix for parameter names.
        return_bias: If true, return bias together with outputs in forward pass.
        disable_tp: If true, tensor parallelism will be disabled for this layer.
    FN Treturn_bias
disable_tprw   rx   skip_bias_addry   quant_configprefixr   r   c                   t                                                       || _        || _        || _        |t          j                    }|| _        || _        || _	        d| _
        |t                      | _        n|                    | |          | _        || _        || _        |st!                      nd| _        |st%                      nd| _        d S )NF)r   r   r?   )super__init__rw   rx   r   r   get_default_dtypery   r   r   allow_fp8_block_shape_mismatchr   quant_methodget_quant_methodr   r   r   tp_rankr	   tp_size)
r}   rw   rx   r   ry   r   r   r   r   	__class__s
            r8   r   zLinearBase.__init__
  s     	 %&* 244L((.3+;R;T;TD , = =d6 = R RD&$?IP5777qEOV;===UVr:   c                     |                                  D ]/}t          |t                    r| j        |_        | j        |_        0d S r   )
parametersrX   r   r   r   )r}   r5   s     r8   update_param_tp_statusz!LinearBase.update_param_tp_status+  sK    __&& 	- 	-E%!233 - $ $	- 	-r:   )FNNr   )r   r   r   r   rZ   boolr   r   r   rY   r   r   __classcell__r   s   @r8   r   r      s         " $+/26W ! W W WW W 	W
 kD(W )4/W W W W W W W W WB- - - - - - -r:   r   replicated_linearc                        e Zd ZdZ	 	 	 	 	 dddddeded	ed
edej        dz  dedz  de	dedef fdZ
dedej        fdZdej        dej        eej        edz  f         z  fdZde	fdZ xZS )ReplicatedLineara  Replicated linear layer.

    Args:
        input_size: input dimension of the linear layer.
        output_size: output dimension of the linear layer.
        bias: If true, add bias.
        skip_bias_add: If true, skip adding bias but instead return it.
        params_dtype: Data type for the parameters.
        quant_config: Quantization configure.
        prefix: The name of the layer in the state dict, including all parents
                        (e.g. model.layers.0.qkv_proj)
        return_bias: If true, return bias together with outputs in forward pass.
        disable_tp: Take no effect for replicated linear layers.
    TFNr   r   rw   rx   r   r   ry   r   r   r   r   c          
         t          | d          r| j        | _        n|g| _        t                                          ||||||||	           | j        J | j                            | | j        | j        | j        | j        | j	        | j
                   |rQt          t          j        | j        | j	                            | _        t          | j        d| j
        d           d S |                     dd            d S )Noutput_sizes)r   r   r   )r   r   r   r   r   r   )hasattrr   rv   r   r   r   r   rw   rx   ry   r   r   r   r   r   r   r   )r}   rw   rx   r   r   ry   r   r   r   r   r   s             r8   r   zReplicatedLinear.__init__F  sE    4(( 	8*.*;D''+6-D'#! 	 		
 		
 		
  ,,,((O'O, 	) 	
 	
 	
  	2!D,D4EFFF DI 	"#%)%7      ##FD11111r:   r5   r]   c                 :   t          |dd          }t          |dd          }|r|                                |_        |r6t          |t                    r!|                    |j        |j                   t          |j                  dk    r|	                    d          }|
                                |
                                k    s4J d|
                                 d|
                                             |j                            |           d S )	Nis_gguf_weightFis_gguf_weight_typer   r   r?   zTried to load weights of size zto a parameter of size )r4   itemweight_typerX   r   materializerJ   r   r\   reshapesizerI   copy_)r}   r5   r]   r   r   s        r8   r   zReplicatedLinear.weight_loader~  s(   
 !(8%@@%e-BEJJ 	5 - 2 2 4 4E  	Nj0FGG 	Nm19LMMM}"##q(()11!44Mzz||}11333335]-?-?-A-A 5 5&+jjll5 5 433 	
'''''r:   r   rF   c                     | j         s| j        nd }| j        J | j                            | ||          }| j        s|S | j         r| j        nd }||fS r   )r   r   r   r   r   )r}   r   r   outputoutput_biass        r8   forwardzReplicatedLinear.forward  sn     !% 2<tyy ,,,"((q$77 	M#'#5?dii4{""r:   c                 R    d| j          }|d| j         z  }|d| j        d u z  }|S )Nin_features=, output_features=, bias=)rw   rx   r   r}   ss     r8   
extra_reprzReplicatedLinear.extra_repr  sF    ,4?,,	4$"2444	.ty,...r:   )TFNNr   )r   r   r   r   rZ   r   r   r   r   rY   r   r   r   r   tupler   r   r   r   s   @r8   r   r   3  sS        * #+/2662 ! 62 62 6262 62 	62
 62 kD(62 )4/62 62 62 62 62 62 62 62 62p(9 (U\ ( ( ( (.#<# 
elI,<<=	=# # # #C        r:   r   column_parallel_linearc                        e Zd ZdZ	 	 	 	 	 	 dddddeded	ed
ededej        dz  dedz  de	dedef fdZ
ddZdedej        fdZdedej        fdZdej        eej        edz  f         z  fdZde	fdZ xZS )ColumnParallelLinearaa  Linear layer with column parallelism.

    The linear layer is defined as Y = XA + b. A is parallelized along
    its second dimension as A = [A_1, ..., A_p].

    Args:
        input_size: first dimension of matrix A.
        output_size: second dimension of matrix A.
        bias: If true, add bias.
        gather_output: If true, call all-gather on output and make Y available
                       to all GPUs, otherwise, every GPU will have its output
                       which is Y_i = XA_i
        skip_bias_add: This was added to enable performance optimizations where
                       bias can be fused with other element-wise operations. we
                       skip adding bias but instead return it.
        params_dtype: Data type for the parameters.
        quant_config: Quantization configure.
        prefix: The name of the layer in the state dict, including all parents
                        (e.g. model.layers.0.qkv_proj)
        return_bias: If true, return bias together with outputs in forward pass.
        disable_tp: If true, weights matrix won't be sharded through tp rank.
    TFNr   r   rw   rx   r   gather_outputr   ry   r   r   r   r   c	          
      V    |
st                      nd _        |
st                      nd _        | _        t          | j                   _         j        g _        t           d          r fd j	        D              _        t                                          |||||||	|
                                             | _         j        J  j                              j         j         j         j         j         j        j        j        t*          v r j        n j                   |rKt1          t3          j         j        |                     _        t9           j        d j        d           n                     d	d                                              d S )
Nr   r?   r   c                 :    g | ]}t          |j                  S r@   )r   r   rf   rx   r}   s     r8   
<listcomp>z1ColumnParallelLinear.__init__.<locals>.<listcomp>  s2     + + +6A{DL11+ + +r:   r   rt   ru   rv   rw   rx   ry   r   r   r   r   )r   r   r	   r   ru   r   output_size_per_partitionrv   r   r   r   r   %_maybe_allow_fp8_block_shape_mismatchr   r   r   rw   rx   ry   r   r   WEIGHT_LOADER_V2_SUPPORTEDweight_loader_v2r   r   r   r   r   r   r   r   )r}   rw   rx   r   r   r   ry   r   r   r   r   r   s   `          r8   r   zColumnParallelLinear.__init__  s    @JP5777qEOV;===UV(2%)/T\)J)J&'+'E&F#4(( 	+ + + +EIEV+ + +D' 	#! 	 		
 		
 		
 	22444* ,,,((%)%B#'#>(* $.7;UUU %%' 	) 	
 	
 	
  	2!D:,OOO DI 	"#%)%7     ##FD111##%%%%%r:   rF   c                    t          | dd           }t          |dd           }|+t          |          dk     st          | j                  dk    rd S 	 t          |d                   n# t          t
          f$ r Y d S w xY wdk    rd S t          fd| j        D                       r:d| _        t          	                    dt          | dd	          | j                   d S d S )
Nr   rA   r?   r   c              3   *   K   | ]}|z  d k    V  dS r   Nr@   )rf   r   rB   s     r8   	<genexpr>zMColumnParallelLinear._maybe_allow_fp8_block_shape_mismatch.<locals>.<genexpr>  s,      KKttg~"KKKKKKr:   TzDAllowing FP8 block shape mismatch for %s (block_n=%d, partitions=%s)r   z	<unknown>)
r4   r\   rv   rZ   r[   	TypeErroranyr   loggerdebug)r}   r   weight_blockrB   s      @r8   r   z:ColumnParallelLinear._maybe_allow_fp8_block_shape_mismatch  s   t^T::|-@$GG <  1$$4.//144F	,q/**GGI& 	 	 	FF	 a<<FKKKKt/JKKKKK 	26D/LLVh44+	    	 	s   A* *A?>A?r5   r]   c                    t          |dd           }t          |dd          }t          |dd          }|p|}t          |dd          }t          |dd          }|r|                                |_        |rpt          |t                    r[t          |j                  }|)||         | j        z  dk    sJ ||         | j        z  ||<   |                    ||j	                   |j
        }	|0|s.|	j        |         }
| j        |
z  }|                    |||
          }t          |j                  dk    r|                    d	          }|	j        |j        k    sJ |	                    |           d S )
Nr   is_sharded_weightFuse_bitsandbytes_4bitr   r   r   r   r?   )r4   r   r   rX   r   r   rJ   r   r   r   rI   r   narrowr\   r   r   )r}   r5   r]   r   r   r   r   r   final_shape
param_datar6   	start_idxs               r8   r   z"ColumnParallelLinear.weight_loader&  s   UL$77
#E+>FF '/F N N .F1F !(8%@@%e-BEJJ 	5 - 2 2 4 4E  	Fj0FGG 	F}233K%":.=BBBB*5j*AT\*QJ'k1DEEEZ
!*;!#)*5Jz1I)00Y
SSM }"##q(()11!44M=#66666'''''r:   c                     t          |j                  dk    r/|                                dk    sJ |                    d          }|                    |           d S Nr   r?   r]   )r\   rJ   numelr   load_column_parallel_weightr}   r5   r]   s      r8   r   z%ColumnParallelLinear.weight_loader_v2K  sg     }"##q(( &&((A----)11!44M)))FFFFFr:   c                     | j         s| j        nd }| j        J | j                            | ||          }| j        r| j        dk    rt          |          }n|}| j        s|S | j         r| j        nd }||fS )Nr?   )r   r   r   r   r   r   r   r   )r}   input_r   output_parallelr   r   s         r8   r   zColumnParallelLinear.forwardS  s     !% 2<tyy  ,,,+11$EE 	%$,"2"25oFFFF$F 	M#'#5?dii4{""r:   c                     d| j          }|d| j         z  }|d| j        d u z  }|d| j         z  }|d| j         z  }|S )Nr   r   r   
, tp_size=z, gather_output=)rw   r   r   r   r   r   s     r8   r   zColumnParallelLinear.extra_reprh  sq    ,4?,,	B$"@BBB	.ty,...	($,(((	4 2444r:   TFFNNr   )rF   N)r   r   r   r   rZ   r   r   r   r   rY   r   r   r   r   r   r   r   r   r   r   r   r   s   @r8   r   r     s        : ##+/26C& ! C& C& C&C& C& 	C&
 C& C& kD(C& )4/C& C& C& C& C& C& C& C& C&J   6#(9 #(U\ #( #( #( #(JG&7 G G G G G# 
elI,<<=	=# # # #*C        r:   r   c                        e Zd ZdZ	 	 	 	 	 	 dddddedee         d	ed
ededej        dz  de	dz  de
dedef fdZ	 ddedej        dedz  fdZdedej        fdZ	 ddedej        dedz  fdZ xZS )MergedColumnParallelLineara
  Packed linear layers with column parallelism.

    Similar to ColumnParallelLinear, but the weight matrix is concatenated
    along the output dimension. When the weight matrix is loaded, the
    different partitions are sharded separately.

    Args:
        input_size: input dimension of the linear layer.
        output_sizes: list of output dimensions of the linear layer.
        bias: If true, add bias.
        gather_output: If true, call all-gather on output and make the output
                       available to all GPUs, otherwise, every GPU will have
                       its own output.
        skip_bias_add: This was added to enable performance optimizations where
                       bias can be fused with other element-wise operations. we
                       skip adding bias but instead return it.
        params_dtype: Data type for the parameters.
        quant_config: Quantization configure.
        prefix: The name of the layer in the state dict, including all parents
                        (e.g. model.layers.0.qkv_proj)
        return_bias: If true, return bias together with outputs in forward pass.
        disable_tp: If true, all weights matrix won't be sharded, this layer
                    will be treated as a "Replicated" MergedLinear.
    TFNr   r   rw   r   r   r   r   ry   r   r   r   r   c	                    | _         |
st                      nd _        |
st                      nd _        t           fd|D                       sJ t                                          |t          |          |||||||	|

  
         d S )Nr?   r   c              3   4   K   | ]}|j         z  d k    V  dS r   )r   r   s     r8   r   z6MergedColumnParallelLinear.__init__.<locals>.<genexpr>  s/      SS{;-2SSSSSSr:   
rw   rx   r   r   r   ry   r   r   r   r   )	r   r	   r   r   r   allr   r   r   )r}   rw   r   r   r   r   ry   r   r   r   r   r   s   `          r8   r   z#MergedColumnParallelLinear.__init__  s     )EOV;===UV?IP5777qSSSSlSSSSSSSS!L))''%%#! 	 	
 	
 	
 	
 	
r:   r5   r]   rE   c                 	   t          |dd          }t          |dd          }|rf|=|j        |                                                                        |j        |<   n%fdt          | j                  D             |_        d S |rt          |dd           }                    |          | j        z  }| j	        |z  }|i
                    |||          |j                            |           t          |j                  |j        |<   |j                                       d S |j        }	t          |dd           }t          |dd          }
||?|
rt!          |	d          \  }	|	j        j        k    sJ |	                               d S d}t          |dd          }g }t          | j                  D ]"\  }}|                    |||f           ||z  }#t          |d	d           }|D ]\  }}}||k    r(||j        z  }||j        z  }t'          |||          \  }}t)          |||          \  }}|rwt+          t-          j        dg| j        z                       fd
t          | j                  D             }| j        df|d<   t3          ||t5          |                    \  }}
                    |||          }|                     |||           d S |t          | j                  k     sJ |Jt9          | j        d |                   }| j        |         }t;          |t<                    r%t          | dd           }t?          |||          \  }}|| j        z  }|| j        z  }t          |d	d           }||k    r(||j        z  }||j        z  }t'          |||          \  }}t)          |||          \  }}t          |dd          }t          |dd          }|p|}|rj        |         }j        |         |z  }|	
                    |||          }	| j	        |z  }|s
                    |||          nD|
rt!          |	|          \  }	n-t          |dd          }|st@          !                    d           |	j        j        k    sJ |	                               d S )Nr   Fr   c                 @    i | ]\  }}|                                 S r@   r   )rf   rg   rK   r]   s      r8   rh   z<MergedColumnParallelLinear.weight_loader.<locals>.<dictcomp>  s8     + + +041A}))+++ + +r:   r   needs_scalar_to_arrayr   r   
packed_dimc                 F    i | ]\  }}t          |          |         |fS r@   )rY   )rf   rg   r   indexs      r8   rh   z<MergedColumnParallelLinear.weight_loader.<locals>.<dictcomp>  s>     $ $ $#At Aq4 0$ $ $r:   rH   rA   r   ignore_warningzLoading a weight without `output_dim` attribute in MergedColumnParallelLinear, assume the weight is the same for all partitions.)"r4   rI   r   r   shard_weight_type	enumerater   r   r   r   r   r^   appendr\   data_containershard_id_mapr`   rJ   packed_factorr=   r9   r   	itertools
accumulaterx   rQ   rY   r   r   rX   r   rC   r   warning)r}   r5   r]   rE   r   r   r   r6   r   r   r	  current_shard_offsetr   rD   rg   rx   r
  r^   r7   orig_offsetsloaded_weight_shardrA   r   r  r  s     `                     @r8   r   z(MergedColumnParallelLinear.weight_loader  s    !(8%@@%e-BEJJ 	*
?+11-@@@;H;M;M;O;O'88+ + + +8A$BS8T8T+ + +' F 
	 d;;J&++J774<GJz1I* - 4 4ZJ W W%%o66669%:N6O6O"?3$++M:::Z
UL$77
 '/F N N" !( 0L"M11 1-J "'=+>>>>>  ///#$ $+E3JE$R$R!8:M"+D,=">"> 4 4;$$a)={%KLLL$3$$ d;;J6C I I2,
 ++!+u/B!BJ#/53F#FL/Bz<0 0,J ,@:|, ,(
L ) 	 !5qcD<M6M!N!NOOE$ $ $ $'01B'C'C$ $ $L .2-=q,AL)/M|S]]0 0,J '4&:&:j' '# ""5*=xHHHHFT%6!7!77777!t01A/1ABCCL*?;J%!9:: $+D2Et$L$L!+C%z<, ,(
L T\)L4<'J
 !d;;JZ'''5+>>
+u/BB+>:|, ,(
L (<z<( ($J %,E3JE$R$R! '/BE J J !2 J5J$ Q*0<
,2:>P#**:|ZPPJz1I$ X - 4 4ZJ W W" 	(DM?) )%J
 %U,<eDDN! 3   =#66666'''''r:   c                    d}g }t          | j                  D ]"\  }}|                    |||f           ||z  }#|D ]\  }}}	t          |t          t
          f          r*|j        |j        k    r|                    |	|          \  }	}|	                    |j        ||	          }
| 
                    ||
|           dS )a  
        Handle special case for models where MLP layers are already
        fused on disk. In this case, we have no shard id. This function
        determines the shard id by splitting these layers and then calls
        the weight loader using the shard id.

        An example of a model with these fused layers:
        https://huggingface.co/microsoft/Phi-3-mini-4k-instruct
        r   r6   r7   N)r  r   r  rX   r   r   r
  r    adjust_shard_indexes_for_packingr   r   )r}   r5   r]   r  rD   rg   rx   r^   r7   r6   r  s              r8   "_load_fused_module_from_checkpointz=MergedColumnParallelLinear._load_fused_module_from_checkpointA  s     !46'(9:: 	0 	0NA{  !%9;!GHHH K/  2? 	H 	H.HlJ
 5#8:M"NOO$(888+0+Q+Q) ,R , ,(
L #0"6"6 ,
# # !!%)<hGGGG	H 	Hr:   c                 `   |{t          |t                    r|                    |d           d S t          |          t          t
          fv r|                    |           d S |                     ||           d S |t          | j                  k     sJ t          | j        d |                   }| j        |         }t          |t                    r%t          | dd           }t          |||          \  }}|| j        z  }|| j        z  }|                    ||||| j                   d S )Nr   )r]   r^   r   rA   )r]   r^   r7   r6   r   )rX   r   load_merged_column_weighttyper   r   r  r\   r   r   r   r4   rC   r   r   r}   r5   r]   rE   r7   r6   rA   s          r8   r   z+MergedColumnParallelLinear.weight_loader_v2e  sb    "%!899 //mVW/XXXe!13D EEE//m/LLL33E=IIIFT%6!7!777774,-=o-=>??&7
e566 	 '.A4 H H'?!:|( ($J 	%t|#
'''$%!L 	( 	
 	
 	
 	
 	
r:   r   r   )r   r   r   r   rZ   r   r   r   r   r   rY   r   r   r   r   r   r  r   r   r   s   @r8   r  r  q  s        : ##+/26
 ! 
 
 

 3i
 	

 
 
 kD(
 )4/
 
 
 
 
 
 
 
 
H '+	T( T(T( |T( t	T( T( T( T(l"H&"H7<|"H "H "H "HP '+	%
 %
 %
 |%
 t	%
 %
 %
 %
 %
 %
 %
 %
r:   r  c                       e Zd ZdZ	 	 	 	 	 	 ddddddeded	ed
edz  dededej        dz  dedz  de	dedededz  f fdZ
de	fdZde	fdZdedej        fdZ	 ddedej        de	dz  fdZ	 ddedej        de	dz  fdZ xZS )QKVParallelLineara  Linear layers for the attention's QKV transformation.

    Linear layers for the linear transformation of the query, key, and value
    vectors in the attention layer. The weight matrix is concatenated along
    the output dimension. The layer is parallelized along the head dimension.
    When the number of key/value heads is smaller than the number of query
    heads (e.g., multi-query/grouped-query attention), the key/value head may
    be replicated while the query heads are partitioned.

    Args:
        hidden_size: input hidden state size of the transformer.
        head_size: size of each attention head.
        total_num_heads: total number of attention query heads.
        total_num_kv_heads: total number of attention key/value heads. If
                            None, assume total_num_kv_heads = total_num_heads.
        bias: If true, add bias.
        skip_bias_add: This was added to enable performance optimizations where
                       bias can be fused with other element-wise operations. we
                       skip adding bias but instead return it.
        params_dtype: Data type for the parameters.
        quant_config: Quantization configure.
        prefix: The name of the layer in the state dict, including all parents
                        (e.g. model.layers.0.qkv_proj)
        return_bias: If true, return bias together with outputs in forward pass.
        disable_tp: If true, weights matrix won't be sharded through tp rank.
    NTFr   )r   r   v_head_sizehidden_size	head_sizetotal_num_headstotal_num_kv_headsr   r   ry   r   r   r   r   r$  c
                   || _         || _        ||n|| _        || _        ||}|| _        |st                      nd}t          | j        |          | _        || j        k    r"d| _        t          || j                  | _	        n!t          | j        |          | _        d| _	        | j         }| j        | j        z  | j        | j        z  z   | j        | j        z  z   |z  }| j        | j        z  |z  | j        | j        z  |z  | j        | j        z  |z  g| _
        t                                          |||d||||	|
|
  
         d S )Nr?   Fr  )r%  r&  r$  r'  r(  r	   r   	num_headsnum_kv_headsnum_kv_head_replicasr   r   r   )r}   r%  r&  r'  r(  r   r   ry   r   r   r   r   r$  r   rw   rx   r   s                   r8   r   zQKVParallelLinear.__init__  s     '"*5*A;;y.%!0"4@JQ6888PQ 4g>>d--- !D(.w8O(P(PD%% &t'> H HD()D%%
NT^+$.01$"223 	 NT^+g5.8 007:
 	!#'%%#! 	 	
 	
 	
 	
 	
r:   rE   c                     d| j         | j        z  | j         | j        z   | j        z  | j         | j        z   | j        z  | j        | j        z  z   d}|                    |          S )Nr   rU   rV   rW   rH   r*  r&  r+  r$  get)r}   rE   shard_offset_mappings      r8   _get_shard_offset_mappingz+QKVParallelLinear._get_shard_offset_mapping  so    $.0.4#44Fnt'88DNJ$"223	 
  
 $''888r:   c                     | j         | j        z  | j        | j        z  | j        | j        z  d}|                    |          S )NrT   r/  )r}   rE   shard_size_mappings      r8   _get_shard_size_mappingz)QKVParallelLinear._get_shard_size_mapping  sL    $.0"T^3"T%55
 

 "%%o666r:   r5   r]   c                    dd| j         | j        z  fd| j         | j        z  | j        | j        z  fd| j         | j        z   | j        z  | j        | j        z  fg}|D ]\  }}}t	          |t
          t          f          r*|j        |j        k    r|	                    ||          \  }}|
                    |j        ||          }|                     |||           dS )a  
        Handle special case for models where QKV layers are already
        fused on disk. In this case, we have no shard id. This function
        determines the shard id by splitting these layers and then calls
        the weight loader using the shard id.

        An example of a model with these fused layers:
        https://huggingface.co/microsoft/Phi-3-mini-4k-instruct
        rU   r   rV   rW   r  N)r'  r&  r(  r$  rX   r   r   r
  r   r  r   r   )r}   r5   r]   rD   r^   r7   r6   r  s           r8   r  z4QKVParallelLinear._load_fused_module_from_checkpoint  s&    !T)DN:;$t~5'$.8 %(??4>Q'$*::
 3@ 	H 	H.HlJ
 5#8:M"NOO$(888+0+Q+Q) ,R , ,(
L #0"6"6 ,
# # !!%)<hGGGG	H 	Hr:   c                 6   |t          |t                    r|                    |d| j                   d S t	          |          t
          t          fv r|                    || j                   d S |                     ||           d S |dv sJ |                     |          }| 	                    |          }t          |t                    r%t          | dd           }t          |||          \  }}|                    || j        |||| j                   d S )Nr   )r]   r^   r   )r]   r   rT   rA   )r]   r*  r^   r7   r6   r   )rX   r   load_qkv_weightr   r   r   r   r  r2  r5  r   r4   rC   r,  r!  s          r8   r   z"QKVParallelLinear.weight_loader_v2   sU    "%!899 %%"/!T\ &    e!13D EEE%%M4<%XXX33E=IIIF/111155oFF11/BB
e566 	 '.A4 H H'?!:|( ($J 	'/$%!L 	 	
 	
 	
 	
 	
r:   c                 
   t          |dd          }t          |dd          }|r`dddd}|C|j        ||                                                                                 |j        |<   nfd|D             |_        d S |rt          |d	d           }                    |          | j        z  }| j        |z  }	|i                    ||	|          |j	        
                    |           t          |j                  |j        |<   |j        
                               d S |j        }
t          |d	d           }t          |d
d          }||?|rt          |
d          \  }
|
j        j        k    sJ |
                               d S dd| j        | j        z  fd| j        | j        z  | j        | j        z  fd| j        | j        z   | j        z  | j        | j        z  fg}t          |dd          }t          |dd           }|D ]\  }}}||k    r(||j        z  }||j        z  }t+          |||          \  }}|rd| j        | j        z  f| j        | j        z  | j        | j        z  f| j        | j        z   | j        z  | j        | j        z  f| j        | j        z   | j        z  | j        | j        z  z   dfd}t-          |||          \  }}                    |||          }|                     |||           d S |dv sJ ||dk    rd}| j        | j        z  }nQ|dk    r| j        | j        z  }| j        | j        z  }n,|dk    r&| j        | j        z   | j        z  }| j        | j        z  }t5          |t6                    r%t          | dd           }t9          |||          \  }}t          |dd           }||k    r(||j        z  }||j        z  }t+          |||          \  }}t          |dd          }t          |dd          }|p|}|rd| j        | j        z  f| j        | j        z  | j        | j        z  f| j        | j        z   | j        z  | j        | j        z  f| j        | j        z   | j        z  | j        | j        z  z   dfd}t-          |||          \  }}|
                    |||          }
|dk    r| j        }n| j        | j        z  }||z  }	|s                    ||	|          nD|rt          |
|          \  }
n-t          |dd          }|st<                              d           |
j        j        k    sJ |
                               d S )Nr   Fr   r   r?   rS   rT   c                 :    i | ]}|                                 S r@   r  )rf   rV   r]   s     r8   rh   z3QKVParallelLinear.weight_loader.<locals>.<dictcomp>W  s'    *T*T*Tq1m.@.@.B.B*T*T*Tr:   r   r	  rU   rV   rW   r   r
  r.  rA   r   r  zwLoading a weight without `output_dim` attribute in QKVParallelLinear, assume the weight is the same for all partitions.) r4   rI   r   r   r  r   r   r   r   r^   r  r\   r  r  r`   rJ   r'  r&  r(  r$  r  r=   rQ   r   r*  r+  rX   r   rC   r,  r   r  )r}   r5   r]   rE   r   r   idx_mapr   r6   r   r   r	  rD   r   r
  r^   r7   orig_qkv_offsetsr  rA   r   
shard_rankr  s     `                    r8   r   zQKVParallelLinear.weight_loaderG  s    !(8%@@%e-BEJJ 	AA..G*
7?34::=III;H;M;M;O;O'88*T*T*T*TG*T*T*T'F 
	 d;;J&++J774<GJz1I* - 4 4ZJ W W%%o66669%:N6O6O"?3$++M:::Z
UL$77
 !(/F N N" !( 0L"M11 1-J "'=+>>>>>  /// a->?(4>9+dn< )D,CCt~U+d.>>M %,E3JE$R$R! d;;J6C (I (I2,
 ++!+u/B!BJ#/53F#FL 0Cz<0 0,J ) !5!FG 04>A 3dnD
 "1D4KK"n- 3d6FF "1D4KK"n-"58HHI 	"( ($& 0N/0 0,J '4&:&:j' '# ""5*=xHHHHF/1111 !#%% !^dn<

 C''#~>!.?

 C'' $1B BdnT!.1AA
%!9:: $+D2Et$L$L!+C%z<, ,(
L !d;;JZ'''5+>>
+u/BB ,?:|, ,(
L %,E3JE$R$R! '/BE J J !2 J5J$ T^dn<=7)DN:
 $*;;t~M)D,<<
 $*;;t~M+d.>>?$ $   ,J+_, ,(
L $**:|ZPPJ#%%!\

!\T-FF
"Z/I$ X - 4 4ZJ W W # 	(DM?) )%J %U,<eDDN! *   =#66666'''''r:   )NTFNNr   r   )r   r   r   r   rZ   r   r   r   r   rY   r   r2  r5  r   r   r  r   r   r   r   r   s   @r8   r#  r#    s        @ *.#+/267
 ! "&7
 7
 7
7
 7
 	7

  $J7
 7
 7
 kD(7
 )4/7
 7
 7
 7
 4Z7
 7
 7
 7
 7
 7
r9 9 9 9 97s 7 7 7 7*H&*H7<|*H *H *H *H` '+	%
 %
 %
 |%
 t	%
 %
 %
 %
V '+	B( B(B( |B( t	B( B( B( B( B( B( B( B(r:   r#  row_parallel_linearc                        e Zd ZdZ	 	 	 	 	 	 	 dddddeded	ed
ededej        dz  dededz  de	dedef fdZ
dedej        fdZdedej        fdZdej        eej        edz  f         z  fdZde	fdZ xZS )RowParallelLineara  Linear layer with row parallelism.

    The linear layer is defined as Y = XA + b. A is parallelized along
    its first dimension and X along its second dimension as:
               -   -
              | A_1 |
              | .   |
          A = | .   |        X = [X_1, ..., X_p]
              | .   |
              | A_p |
               -   -
    Arguments:
        input_size: first dimension of matrix A.
        output_size: second dimension of matrix A.
        bias: If true, add bias. Note that bias is not parallelized.
        input_is_parallel: If true, we assume that the input is already
                           split across the GPUs and we do not split
                           again.
        skip_bias_add: This was added to enable performance optimization where
                       bias can be fused with other element-wise operations.
                       We skip adding bias but instead return it.
        params_dtype: Data type for the parameters.
        reduce_results: If true, call all-reduce on output and make Y available
                       to all GPUs, otherwise, every GPU will have its output
                       which is Y = X_iA_i
        quant_config: Quantization configure.
        prefix: The name of the layer in the state dict, including all parents
                        (e.g. model.layers.0.down_proj)
        return_bias: If true, return bias together with outputs in forward pass.
        disable_tp: If true, weights matrix won't be sharded through tp rank.
    TFNr   r   rw   rx   r   input_is_parallelr   ry   reduce_resultsr   r   r   r   c
          
      
   |st                      nd| _        |st                      nd| _        t	          || j                  | _        || _        |g| _        t                      	                    ||||||	|
|           || _
        || _        | j        J | j                            | | j        | j        | j        | j        | j        | j        j        j        t&          v r| j        n| j                   |s|r|st-          d          |rKt/          t1          j        | j        |                    | _        t7          | j        d| j        d           n|                     dd            |                                  d S )	Nr   r?   r   r   zUWhen not reduce the results, adding bias to the results can lead to incorrect resultsr   r   r   )r   r   r	   r   r   ru   r   rv   r   r   rA  rB  r   r   rw   rx   ry   r   r   r   r   r   r[   r   r   r   r   r   r   r   )r}   rw   rx   r   rA  r   ry   rB  r   r   r   r   r   s               r8   r   zRowParallelLinear.__init__1  s     @JP5777qEOV;===UV(.z4<(H(H%)4&'2m##! 	 		
 		
 		
 "3, ,,,((%)%B#'#>(* $.7;UUU %%' 	) 	
 	
 	
  	4 	 	8  
  
	2!%+d.>l"S"S"STTDI	"#%)%7     ##FD111##%%%%%r:   r5   r]   c                    t          |dd           }t          |dd          }t          |dd          }|p|}t          |dd          }t          |dd          }|r|                                |_        |rgt          |t                    rRt          |j                  }|r||         | j        z  ||<   |                    t          |          |j
                   |j        }	|0|s.|	j        |         }
| j        |
z  }|                    |||
          }t          |j                  dk    r|                    d	          }|	j        |j        k    sJ |	                    |           d S )
Nr   r   Fr   r   r   r   r   r?   )r4   r   r   rX   r   r   rJ   r   r   r   r   rI   r   r   r\   r   r   )r}   r5   r]   r   r   r   r   r   weight_shaper   r6   r   s               r8   r   zRowParallelLinear.weight_loaderv  s   E;55	 '/F N N#E+>FF .F1F !(8%@@%e-BEJJ 	5 - 2 2 4 4E  	Nj0FGG 	N 344L R*6y*AT\*QY'eL119LMMMZ
 ): #))4Jz1I)00IzRRM }"##q(()11!44M=#66666'''''r:   c                     t          |j                  dk    r/|                                dk    sJ |                    d          }|                    |           d S r   )r\   rJ   r   r   load_row_parallel_weightr   s      r8   r   z"RowParallelLinear.weight_loader_v2  sg     }"##q(( &&((A----)11!44M&&]&CCCCCr:   rF   c                    | j         r|}n5t          || j                  }|| j                                                 }| j        J | j        dk    s| j        rd n| j        }| j                            | ||          }| j	        r| j        dk    rt          |          }n|}| j        s|S | j        r| j        nd }||fS )N)num_partitionsr   r?   )rA  r
   r   r   
contiguousr   r   r   r   rB  r   r   )r}   r   input_parallelsplitted_inputbias_r   r   r   s           r8   r   zRowParallelLinear.forward  s     ! 	G#NN8t|  N ,DL9DDFFN  ,,, ))T-?)di+11$NN 	%4<!#3#35oFFFF$F 	M#'#5?dii4{""r:   c                     d| j          }|d| j         z  }|d| j        d u z  }|d| j         z  }|d| j         z  }|S )Nr   r   r   r   z, reduce_results=)ru   rx   r   r   rB  r   s     r8   r   zRowParallelLinear.extra_repr  sr    :48::	4$"2444	.ty,...	($,(((	6!4666r:   )TTFNTNr   )r   r   r   r   rZ   r   r   r   r   rY   r   r   r   r   r   r   r   r   r   r   r   s   @r8   r@  r@    s        L "&#+/#26C& ! C& C& C&C& C& 	C&
  C& C& kD(C& C& )4/C& C& C& C& C& C& C& C& C&J!(9 !(U\ !( !( !( !(FD&7 D D D D D# 
elI,<<=	=# # # #:C        r:   r@  );r  abcr   typingr   r   torch.nn.parameterr   r   vllm.distributedr   r   r	   r
   r   r   vllm.loggerr   vllm.model_executor.custom_opr   3vllm.model_executor.layers.quantization.base_configr   r   r   r   vllm.model_executor.parameterr   r   r   r   r   r   r   vllm.model_executor.utilsr   vllm.platformsr   r   r   r   r9   r=   rC   rj   rY   r   rZ   rQ   r`   rq   rs   r   r   registerr   r   r  r#  r@  r@   r:   r8   <module>rZ     sE                    @ @ @ @ @ @ @ @                $ # # # # # 2 2 2 2 2 2        G F F F F F                  7 6 6 6 6 6 + + + + + +	X		   4$ $ $J J J$ $ $,,%)#uS#X*>%?,RU,
38_, , , ,* * *2c3h    D&" &" &" &" &") &" &" &"R,I ,I ,I ,I ,I. ,I ,I ,I^3- 3- 3- 3- 3- 3- 3- 3-n &''s s s s sz s s ('sn +,,B B B B B: B B -,BJY
 Y
 Y
 Y
 Y
!5 Y
 Y
 Y
x|( |( |( |( |(, |( |( |(@ ())w w w w w
 w w *)w w wr:   