
    .`iY                        d dl Z d dlmZ d dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZmZmZ d dlmZ dd	lmZ dd
lmZmZ ddZ G d de          Z G d de          Z G d de          Z G d de          Z G d de          Z G d de          Z G d de          Z G d de          Z G d de          ZdS )     N)PretrainedConfig)
LoRAConfig) tensor_model_parallel_all_gather)divide)ColumnParallelLinearMergedColumnParallelLinearQKVParallelLinear)current_platform   )BaseLinearLayerWithLoRA)_fully_sharded_can_replace_not_fully_sharded_can_replacelayerColumnParallelLinearWithLoRAc                 F   |j         t          |j                  cxk    r1t          |j                  cxk    rt          |j                  k    sn J |j        j                            |j        | |          }|                     d| j	        d                   } |                    d|j	        d                   |j	        }}t          j        |j         | j	        d         |j        d         j	        d         ft          j        | j                  }|j                            || |j        d          }t!          j                    s|}t%          |          }|j                            |||j        |j        dd          }t!          j                    s|} |j        | }|S )z
    For `ColumnParallelLinearWithLoRA` or classes that inherit from
    `ColumnParallelLinearWithLoRA`, they share the same `apply` logic.
    r      dtypedeviceg      ?T)offset_start	add_input)n_sliceslenlora_a_stackedlora_b_stackedoutput_slices
base_layerquant_methodapplyviewshapetorchzerosfloat32r   punica_wrapper
add_shrinkr
   can_update_inplacer   
add_expand)xbiasr   outputout_orig_shapebuffersshrunk_bufferslora_outputs           {/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/lora/layers/column_parallel_linear.py
_mcp_applyr2      s    	u#$$	$ 	$ 	$ 	$u#$$	$ 	$ 	$ 	$ u"##	$ 	$ 	$ 	$ 	$ 	$ *001A1dKKF	r172;A#[[V\"-=>>NF k	U%9!%<%B1%EFmx  G +0*>*I*IE(#+ +N .00 ! .w77G',';'F'F (G ( (K .00 V[.)FM    c                       e Zd ZdZdeddf fdZdej        dej        fdZdej        dej        fd	Z	d
ej        dej        e
ej        ej        dz  f         z  fdZee	 ddej        dedededz  def
d                        Z xZS )r   a$  
    LoRA on top of ColumnParallelLinear layer.
    LoRA B is sliced for tensor parallelism.
    There are two types for the `base_layer`:
    1. ColumnParallelLinear, e.g.`dense_h_to_4h` in `FalconForCausalLM`.
    2. MergedColumnParallelLinear, e.g.`gate_up_proj` in `Phi3ForCausalLM`.
    r   returnNc                     t                                          |           t          |          t          u | _        | j        j        | _        d| _        d S Nr   )	super__init__typer   is_merged_col_linearr   output_size_per_partitionoutput_sizer   selfr   	__class__s     r1   r9   z%ColumnParallelLinearWithLoRA.__init__S   sK    $$$ %)$4$48R$R!?Dr3   lora_ac                     |S N r?   rA   s     r1   slice_lora_az)ColumnParallelLinearWithLoRA.slice_lora_a]   s    r3   lora_bc                 b   | j         r{| j        dz  }|j        d         dz  }|| j        |z  | j        dz   |z  d d f         }||| j        |z  z   || j        dz   |z  z   d d f         }t	          j        ||gd          }n,| j        }| j        |z  }| j        dz   |z  }|||d d f         }|S )Nr   r   r   dim)r;   r=   r"   tp_rankr#   cat)r?   rG   
shard_sizeoffsetleft_weightright_weight	start_idxend_idxs           r1   slice_lora_bz)ColumnParallelLinearWithLoRA.slice_lora_b`   s     $ 	2)Q.J\!_)F z)T\A-=,KKQQQNK "
22V<!#z162 2L
 Y\:BBBFF )Jz1I|a':5GIg-qqq01Fr3   input_c                    | j         j        s| j         j        nd}|                     ||          }| j         j        r| j        dk    rt          |          }n|}| j         j        s|S | j         j        r| j         j        nd}||fS )zForward of ColumnParallelLinear

        Args:
            input_: Tensor whose last dimension is `input_size`.

        Returns:
            - output
            - bias
        Nr   )r   skip_bias_addr+   r    gather_outputtp_sizer   return_bias)r?   rT   r+   output_parallelr,   output_biass         r1   forwardz$ColumnParallelLinearWithLoRA.forwardy   s     ,0?+HRt##d **VT22?( 	%T\A-=-=5oFFFF$F* 	M.2o.KUdo**QU{""r3   source_layerlora_configpacked_modules_listmodel_configc                     t          |          t          u rdS t          |          t          u r>t          |          dk    rdS t	          |d          ot          |j                  dk     S dS )NTr   Foutput_sizes   )r:   r   r   r   hasattrrb   clsr]   r^   r_   r`   s        r1   can_replace_layerz.ColumnParallelLinearWithLoRA.can_replace_layer   s     !5554!;;;&''1,,u
 n55 8122a7  ur3   rC   )__name__
__module____qualname____doc__r   r9   r#   TensorrF   rS   tupler\   classmethodr   nnModuler   listr   boolrg   __classcell__r@   s   @r1   r   r   J   sN        #7 D      5< EL    5< EL    2#l#	elEL4,??@	@# # # #8 # 15 i   "	
 '- 
   $# [    r3   c                       e Zd ZdZdeez  ddf fdZ	 ddedede	dz  ddfd	Z
d
eej        dz           deej        dz           fdZdeej        dz           deej        dz           fdZded
ej        eej                 z  dej        eej                 z  fdZee	 ddej        dedede	dz  def
d                        Z xZS )"MergedColumnParallelLinearWithLoRAzColumnParallelLinear layer that is composed of 2 sublayers (slices)
    packed together (e.g. gate_proj + up_proj -> gate_up_proj).

    This means we have 2 LoRAs, each applied to one half of the layer.

    Both slices must have the same size.
    r   r5   Nc                      t                                          |            j        j        }t	           fd|D                        _        t           j                   _         j        f j        z   _	        d S )Nc              3   B   K   | ]}t          |j                  V  d S rC   )r   rX   ).0r=   r?   s     r1   	<genexpr>z>MergedColumnParallelLinearWithLoRA.__init__.<locals>.<genexpr>   s@       #
 #
2=F;--#
 #
 #
 #
 #
 #
r3   )
r8   r9   r   rb   rm   r   r   r   rK   
output_ids)r?   r   rb   r@   s   `  r1   r9   z+MergedColumnParallelLinearWithLoRA.__init__   s     	$$$ 3" #
 #
 #
 #
AM#
 #
 #
 
 
 D.//</DM9r3   	max_lorasr^   r`   c                 $     _         j        sj        nt          j         j                  t           fdt           j                  D                        _        t           fd j	        D                        _
        dS )zk
        The main reason for overriding this function is to enhance  code
        maintainability.
        c           	   3   j   K   | ]-}t          j        d j        j        j                  V  .dS r   r   N)r#   r$   
input_size
lora_dtyper   )ry   _ lora_a_output_size_per_partitionr^   r|   r?   s     r1   rz   zIMergedColumnParallelLinearWithLoRA.create_lora_weights.<locals>.<genexpr>   sc       
$
 
$
  K0!,{  
$
 
$
 
$
 
$
 
$
 
$
r3   c           	   3   j   K   | ]-}t          j        d |j        j        j                  V  .dS r   )r#   r$   max_lora_rankr   r   )ry   r=   r^   r|   r?   s     r1   rz   zIMergedColumnParallelLinearWithLoRA.create_lora_weights.<locals>.<genexpr>   sd       
$
 
$
  K)!,{  
$
 
$
 
$
 
$
 
$
 
$
r3   N)r^   fully_sharded_lorasr   r   rX   rm   ranger   r   r   r   )r?   r|   r^   r`   r   s   ``` @r1   create_lora_weightsz6MergedColumnParallelLinearWithLoRA.create_lora_weights   s     ' 2AK%%14<@@ 	) $ 
$
 
$
 
$
 
$
 
$
 
$
 
$
 4=))
$
 
$
 
$
 

 

 $ 
$
 
$
 
$
 
$
 
$
 
$
  $1
$
 
$
 
$
 

 

r3   rA   c                     |S rC   rD   rE   s     r1   rF   z/MergedColumnParallelLinearWithLoRA.slice_lora_a   s	     r3   rG   c                     d g| j         z  }t          t          | j        | j                            D ],\  }\  }}||         x}|||z  ||dz   z  d d f         ||<   -|S r7   )r   	enumeratezipr{   r   )r?   rG   sliced_lora_bishard_idrM   lora_b_is          r1   rS   z/MergedColumnParallelLinearWithLoRA.slice_lora_b   s     .)2!344*
 *
 	 	%A%* #1I%2#+)J(Q,,GGJ$a  r3   indexc                    |                      |           | j        dk    r*|                     |          }|                     |          }t	          | j                  D ]}||         x}F| j        |         |dd |j        d         d |j        d         f                             |d           ||         x}F| j	        |         |dd |j        d         d |j        d         f                             |d           d S )Nr   r   T)non_blocking)

reset_lorarX   rF   rS   r   r   r   r"   copy_r   )r?   r   rA   rG   r   lora_a_ir   s          r1   set_loraz+MergedColumnParallelLinearWithLoRA.set_lora   s    	<!&&v..F&&v..Ft}%% 	5 	5A"1I%2#A&11q 113FX^A5F3FF%t%444"1I%2#A&11q 113FX^A5F3FF%t%444	5 	5r3   r]   r_   c                 T    t          |          t          u ot          |          dk    S )Nr   )r:   r   r   re   s        r1   rg   z4MergedColumnParallelLinearWithLoRA.can_replace_layer  s0     "<< .'((A-	
r3   rC   )rh   ri   rj   rk   r   r	   r9   intr   r   r   rq   r#   rl   rF   rS   r   rn   r   ro   rp   rr   rg   rs   rt   s   @r1   rv   rv      s        :47HH:	: : : : : :$ 15	'
 '
'
  '
 '-	'

 
'
 '
 '
 '
R5<$./	elT!	"   
5<$./	elT!	"   55 tEL115 tEL11	5 5 5 5, # 15

 

i

  

 "	


 '-

 


 

 

 $# [

 

 

 

 

r3   rv   c                        e Zd ZdZdeddf fdZdej        dej        fdZe	e
	 ddej        d	ed
ededz  def
d                        Z xZS )QKVParallelLinearWithLoRAa  
    ColumnParallelLinear layer that is specifically designed for
    qkv_proj. Certain models, such as chatglm3 and baichuan-7b,
    only contains a single LoRA within their qkv_proj layer.

    During inference with Tensor Parallel, the weights of lora_b
    must be accurately partitioned according to the respective ranks.

    Q slice may have different shape than K and V slices (which both have
    the same shape).
    r   r5   Nc                 H   t                                          |           | j        j        | j        j        z  | _        | j        j        | j        j        z  | _        | j        j        | j        j        z  | _	        | j        j
        | j        j        z  | _        d| _        d S r7   )r8   r9   r   total_num_heads	head_sizeq_proj_total_size	num_headsq_proj_shard_sizenum_kv_headskv_proj_shard_sizetotal_num_kv_headskv_proj_total_sizer   r>   s     r1   r9   z"QKVParallelLinearWithLoRA.__init__1  s    $$$O+do.GG 	 "&!:T_=V!VO(4?+DD 	 O.1JJ 	 r3   rG   c                    | j         | _        | j         | j        j        z  | _        || j        | j        z  | j        | j        dz   z  d d f         }| j        }||| j        | j        z  z   || j        | j        dz   z  z   d d f         }|| j        z   }||| j        | j        z  z   || j        | j        dz   z  z   d d f         }t          j
        |||gd          }|S )Nr   r   rI   )rK   
q_shard_idr   num_kv_head_replicaskv_shard_idr   r   r   r   r#   rL   )r?   rG   lora_b_qk_offsetlora_b_kv_offsetlora_b_vs          r1   rS   z&QKVParallelLinearWithLoRA.slice_lora_b@  s(   ,<4?+OO"T_4t7M"8$ $AA

 )t.1AAAH%)9A)=>E? ?AA

 d55t.1AAAH%)9A)=>E? ?AA

 Hh9qAAAr3   r]   r^   r_   r`   c                 T    t          |          t          u ot          |          dk    S r7   r:   r	   r   re   s        r1   rg   z+QKVParallelLinearWithLoRA.can_replace_layerW  ,     L!!%66X3?R;S;SWX;XXr3   rC   )rh   ri   rj   rk   r	   r9   r#   rl   rS   rn   r   ro   rp   r   rq   r   rr   rg   rs   rt   s   @r1   r   r   $  s        
 
#4       5< EL    . # 15Y YiY  Y "	Y
 '-Y 
Y Y Y $# [Y Y Y Y Yr3   r   c                        e Zd ZdZdeddf fdZ	 ddedededz  ddf fd	Z	e
e	 dd
ej        dedededz  def
d                        Z xZS )MergedQKVParallelLinearWithLoRAaK  MergedColumnParallelLinear layer that is composed of 3 sublayers (slices)
    packed together in qkv proj fashion
    (q_proj + k_proj + v_proj -> qkv_proj).

    This means we have 3 LoRAs, each applied to one slice of the layer.

    Q slice may have different shape than K and V slices (which both have
    the same shape).
    r   r5   Nc                    t                                          |           t          | j        j                  | _        | j        j        | j        j        z  | _        | j        j	        | j        j        z  | _
        | j        | _        | j        | j        j        z  | _        | j        | j
        | j
        f| _        | j        | j        | j        f| _        d S rC   )r8   r9   r   r   rb   r   r   r   r   r   r   rK   r   r   r   r   r{   r>   s     r1   r9   z(MergedQKVParallelLinearWithLoRA.__init__n  s    $$$DO899!%!:T_=V!VO(4?+DD 	 ,<4?+OO "##
 O
r3   r|   r^   r`   c                 N    t                                          |||           dS )z
        The main reason for overloading this function is to handle inconsistent
        weight dimensions in qkv lora.
        N)r8   r   )r?   r|   r^   r`   r@   s       r1   r   z3MergedQKVParallelLinearWithLoRA.create_lora_weights  s'     	##I{LIIIIIr3   r]   r_   c                 T    t          |          t          u ot          |          dk    S )Nrc   r   re   s        r1   rg   z1MergedQKVParallelLinearWithLoRA.can_replace_layer  r   r3   rC   )rh   ri   rj   rk   r	   r9   r   r   r   r   rn   r   ro   rp   rq   rr   rg   rs   rt   s   @r1   r   r   c  s.        
#4 
 
 
 
 
 
 
6 15	
J 
J
J  
J '-	
J
 

J 
J 
J 
J 
J 
J # 15Y YiY  Y "	Y
 '-Y 
Y Y Y $# [Y Y Y Y Yr3   r   c                        e Zd ZdZdej        dej        fdZddej        dej        dz  dej        fdZee		 dd	e
j        d
edededz  def
 fd                        Z xZS )#ColumnParallelLinearWithShardedLoRAz
    Differs from ColumnParallelLinearWithLoRA by slicing LoRA A also.

    Based on S-LoRA, slicing happens along the rank dim.
    rA   r5   c                 l    | j         d         j        d         }| j        |z  }||||z   d d f         }|S Nr   r   r   r"   rK   r?   rA   rM   rQ   s       r1   rF   z0ColumnParallelLinearWithShardedLoRA.slice_lora_a  E    (+1!4
L:-		I
$::AAA=>r3   Nr*   r+   c                 $    t          |||           S rC   r2   r?   r*   r+   s      r1   r    z)ColumnParallelLinearWithShardedLoRA.apply      !T4(((r3   r]   r^   r_   r`   c                 P    t                                          ||||d          S NF)r]   r^   r_   r`   decorater8   rg   rf   r]   r^   r_   r`   r@   s        r1   rg   z5ColumnParallelLinearWithShardedLoRA.can_replace_layer  5     ww((%# 3% ) 
 
 	
r3   rC   rh   ri   rj   rk   r#   rl   rF   r    rn   r   ro   rp   r   rq   r   rr   rg   rs   rt   s   @r1   r   r     s         5< EL    ) )u| )5<$+> )%, ) ) ) )  15
 
i
  
 "	

 '-
 

 
 
 
 
   [
 
 
 
 
r3   r   c                        e Zd ZdZdeej        dz           deej        dz           fdZddej        dej        dz  dej        fdZe	e
	 dd	ej        d
edededz  def
 fd                        Z xZS ))MergedColumnParallelLinearWithShardedLoRAz
    Differs from MergedColumnParallelLinearWithLoRA by slicing the
    LoRA A's also.

    Based on S-LoRA, slicing happens along the rank dim.
    rA   Nr5   c                     | j         d         j        d         }| j        |z  }|d         |d         |||z   d d f         nd |d         |d         |||z   d d f         nd g}|S )Nr   r   r   r   )r?   rA   output_shard_sizeoutput_start_idxs       r1   rF   z6MergedColumnParallelLinearWithShardedLoRA.slice_lora_a  s     !/28;<*;; ay$ 1I&)9<M)MMqqqPQQay$ 1I&)9<M)MMqqqPQQ
 r3   r*   r+   c                 $    t          |||           S rC   r   r   s      r1   r    z/MergedColumnParallelLinearWithShardedLoRA.apply  r   r3   r]   r^   r_   r`   c                 P    t                                          ||||d          S r   r   r   s        r1   rg   z;MergedColumnParallelLinearWithShardedLoRA.can_replace_layer  r   r3   rC   rh   ri   rj   rk   rq   r#   rl   rF   r    rn   r   ro   rp   r   r   rr   rg   rs   rt   s   @r1   r   r     s        5<$./	elT!	"    ) )u| )5<$+> )%, ) ) ) )  15
 
i
  
 "	

 '-
 

 
 
 
 
   [
 
 
 
 
r3   r   c                        e Zd ZdZdej        dej        fdZddej        dej        dz  dej        fdZee		 dd	e
j        d
edededz  def
 fd                        Z xZS ) QKVParallelLinearWithShardedLoRAz
    Differs from QKVParallelLinearWithLoRA by slicing the
    LoRA A's also.

    Based on S-LoRA, slicing happens along the rank dim.
    rA   r5   c                 l    | j         d         j        d         }| j        |z  }||||z   d d f         }|S r   r   r   s       r1   rF   z-QKVParallelLinearWithShardedLoRA.slice_lora_a   r   r3   Nr*   r+   c                 $    t          |||           S rC   r   r   s      r1   r    z&QKVParallelLinearWithShardedLoRA.apply  r   r3   r]   r^   r_   r`   c                 P    t                                          ||||d          S r   r   r   s        r1   rg   z2QKVParallelLinearWithShardedLoRA.can_replace_layer	  r   r3   rC   r   rt   s   @r1   r   r     s         5< EL    ) )u| )5<$+> )%, ) ) ) )  15
 
i
  
 "	

 '-
 

 
 
 
 
   [
 
 
 
 
r3   r   c                        e Zd ZdZdeej        dz           deej        dz           fdZddej        dej        dz  dej        fdZe	e
	 dd	ej        d
edededz  def
 fd                        Z xZS )&MergedQKVParallelLinearWithShardedLoRAz
    Differs from MergedQKVParallelLinearWithLoRA by slicing the
    LoRA A's also.

    Based on S-LoRA, slicing happens along the rank dim.
    rA   Nr5   c                      fdt          d          D              fdt          d          D             }|d         )|d         |d         |d         d         z   d d f         nd |d         )|d         |d         |d         d         z   d d f         nd |d         )|d         |d         |d         d         z   d d f         nd g}|S )Nc                 @    g | ]}j         |         j        d          S )r   )r   r"   )ry   r   r?   s     r1   
<listcomp>zGMergedQKVParallelLinearWithShardedLoRA.slice_lora_a.<locals>.<listcomp>(  s)    HHH!d)!,215HHHr3   rc   c                 0    g | ]}j         |         z  S rD   )rK   )ry   r   r?   rM   s     r1   r   zGMergedQKVParallelLinearWithShardedLoRA.slice_lora_a.<locals>.<listcomp>)  s$    DDDaT\JqM1DDDr3   r   r   r   )r   )r?   rA   rQ   rM   s   `  @r1   rF   z3MergedQKVParallelLinearWithShardedLoRA.slice_lora_a$  s    IHHHuQxxHHH
DDDDD588DDD	 ay$ 1IilYq\JqM%AA111DEEay$ 1IilYq\JqM%AA111DEEay$ 1IilYq\JqM%AA111DEE

 r3   r*   r+   c                 $    t          |||           S rC   r   r   s      r1   r    z,MergedQKVParallelLinearWithShardedLoRA.apply7  r   r3   r]   r^   r_   r`   c                 P    t                                          ||||d          S r   r   r   s        r1   rg   z8MergedQKVParallelLinearWithShardedLoRA.can_replace_layer:  r   r3   rC   r   rt   s   @r1   r   r     s        5<$./	elT!	"   &) )u| )5<$+> )%, ) ) ) )  15
 
i
  
 "	

 '-
 

 
 
 
 
   [
 
 
 
 
r3   r   c                        e Zd ZdZee	 ddej        dede	de
dz  def
d                        Zd	ed
ej        e	ej                 z  dej        e	ej                 z  f fdZ xZS )/MergedColumnParallelLinearVariableSliceWithLoRAzMergedColumnParallelLinear with variable number of slices (3+).

    This handles cases where the checkpoint has a single weight for the whole
    module (not split into slices), but the layer itself has multiple slices.
    Nr]   r^   r_   r`   r5   c                     t          |          t          urdS t          |          dk    rdS t          |          dk    rdS t          |d          ot          |j                  dk    S )NFrc   Tr   rb   )r:   r   r   rd   rb   re   s        r1   rg   zAMergedColumnParallelLinearVariableSliceWithLoRA.can_replace_layerV  s     %???5 "##q((4 "##q((5 L.11 4L-..!3	
r3   r   rA   rG   c                 t   |                      |           t          |t          j                  r|g| j        z  }t          |t          j                  r?| j        j        }g }d}|D ]*}||z   }|                    |||ddf                    |}+|}t                      	                    |||           dS )zSOverride to handle single tensor weights
        that need to be split into slices.r   N)
r   
isinstancer#   rl   r   r   rb   appendr8   r   )
r?   r   rA   rG   rb   lora_b_listrQ   r=   rR   r@   s
            r1   r   z8MergedColumnParallelLinearVariableSliceWithLoRA.set_lorav  s     	 fel++ 	.X-F fel++ 	!?7LKI+ $ $#k1""6)G*;QQQ*>#?@@@#		 F 	/////r3   rC   )rh   ri   rj   rk   rn   r   ro   rp   r   rq   r   rr   rg   r   r#   rl   r   rs   rt   s   @r1   r   r   M  s          # 15
 
i
  
 "	

 '-
 

 
 
 $# [
<00 tEL110 tEL11	0 0 0 0 0 0 0 0 0 0r3   r   )r   r   ) r#   torch.nnro   transformersr   vllm.config.lorar   vllm.distributedr   vllm.distributed.utilsr   !vllm.model_executor.layers.linearr   r   r	   vllm.platformsr
   base_linearr   utilsr   r   r2   r   rv   r   r   r   r   r   r   r   rD   r3   r1   <module>r      s  
        ) ) ) ) ) ) ' ' ' ' ' ' = = = = = = ) ) ) ) ) )         
 , + + + + + 0 0 0 0 0 0 M M M M M M M M0 0 0 0f` ` ` ` `#: ` ` `Ft
 t
 t
 t
 t
)E t
 t
 t
n<Y <Y <Y <Y <Y < <Y <Y <Y~7Y 7Y 7Y 7Y 7Y&H 7Y 7Y 7Y~%
 %
 %
 %
 %
*F %
 %
 %
P+
 +
 +
 +
 +
0R +
 +
 +
\!
 !
 !
 !
 !
'@ !
 !
 !
H.
 .
 .
 .
 .
-L .
 .
 .
bE0 E0 E0 E0 E0&E0 E0 E0 E0 E0r3   