"""
Based on:
Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023).
Punica: Multi-Tenant LoRA Serving.
https://arxiv.org/abs/2310.18547
"""

from typing import final

import torch

from vllm.lora.layers import LoRAMapping
from vllm.triton_utils import HAS_TRITON, triton
from vllm.utils.math_utils import round_up

if HAS_TRITON:
    from vllm.lora.ops.triton_ops import (
        LoRAKernelMeta,
        fused_moe_lora,
        lora_expand,
        lora_shrink,
    )

from vllm import _custom_ops as ops

from .punica_base import PunicaWrapperBase


@final
class PunicaWrapperGPU(PunicaWrapperBase):
    """
    PunicaWrapperGPU is designed to manage and provide metadata for the punica
    kernel. The main function is to maintain the state information for
    Multi-LoRA, and to provide the interface for the punica triton kernel.
    """

    def __init__(
        self,
        max_num_batched_tokens: int,
        max_batches: int,
        device: torch.device | str,
        **kwargs,
    ):
        PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches, device)

        self.lora_config = kwargs["lora_config"]
        self.max_loras = self.lora_config.max_loras
        # Token-level mapping drives the shrink/expand kernels; prompt-level
        # mapping drives the logits path (one entry per sequence).
        self.token_mapping_meta = LoRAKernelMeta.make(
            self.max_loras, max_num_batched_tokens, device=device
        )
        self.prompt_mapping_meta = LoRAKernelMeta.make(
            self.max_loras, max_batches, device=device
        )

    def update_metadata(
        self,
        mapping: LoRAMapping,
        lora_index_to_id: list[int | None],
        max_loras: int,
        vocab_size: int,
        **kwargs,
    ):
        self.is_prefill = mapping.is_prefill
        self._update_base_metadata(mapping, lora_index_to_id, max_loras, vocab_size)

        self.token_mapping_meta.prepare_tensors(self.token_lora_indices)
        self.prompt_mapping_meta.prepare_tensors(self.sampler_indices)

    def add_shrink(
        self,
        y: torch.Tensor,
        x: torch.Tensor,
        lora_a_stacked: tuple[torch.Tensor, ...],
        scale: float,
        **kwargs,
    ):
        """
        Performs GEMM for multiple slices of lora_a.

        Semantics:
        for i in range(len(lora_a_stacked)):
            y[i] += (x @ lora_a_stacked[i]) * scale

        Args:
            y (torch.Tensor): Output tensors
            x (torch.Tensor): Input tensor
            lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weights
            scale (float): Scaling factor for the operation
        """
        x = x.view(-1, x.shape[-1])
        lora_shrink(
            x,
            lora_a_stacked,
            y,
            *self.token_mapping_meta.meta_args(x.size(0)),
            scale,
        )
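
    # Usage sketch for add_shrink (illustrative only; `wrapper`, `hidden`,
    # `rank`, and the dimension names are hypothetical, shapes follow the
    # docstring above):
    #
    #     hidden = torch.randn(num_tokens, hidden_dim, device="cuda")
    #     buf = torch.zeros(len(lora_a_stacked), num_tokens, rank,
    #                       dtype=torch.float32, device="cuda")
    #     wrapper.add_shrink(buf, hidden, lora_a_stacked, scale=1.0)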

    def add_expand(
        self,
        y: torch.Tensor,
        x: torch.Tensor,
        lora_b_stacked: tuple[torch.Tensor, ...],
        output_slices: tuple[int, ...],
        offset_start: int = 0,
        add_inputs: bool = True,
        **kwargs,
    ) -> None:
        """
        Performs GEMM for multiple slices of lora_b.

        Semantics:
            for i in range(len(lora_b_stacked)):
                slice = output_slices[i]
                y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i]
                offset += slice

        Args:
            y (torch.Tensor): Output tensor.
            x (torch.Tensor): Input tensors
            lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weights
            output_slices (tuple[int, ...]): Every slice's size
            add_inputs (bool): Defaults to True.
        """
        y_org = y
        y = y.view(-1, y.shape[-1])
        assert x.ndim == 3
        assert x.size(0) == len(output_slices)
        num_tokens = x.size(1)  # dimension 0 is the number of slices

        lora_expand(
            x,
            lora_b_stacked,
            y,
            *self.token_mapping_meta.meta_args(num_tokens),
            offset_start=offset_start,
            add_inputs=True,
        )
        y = y.view_as(y_org)

    def add_lora_embedding(
        self,
        y: torch.Tensor,
        x: torch.Tensor,
        lora_b_stacked: torch.Tensor,
        add_inputs: bool = True,
        **kwargs,
    ) -> None:
        """
        Applies lora specifically for VocabParallelEmbeddingWithLoRA.

        Semantics:
            y += x @ lora_b_stacked

        Args:
            y (torch.Tensor): Output tensor.
            x (torch.Tensor): Input tensor.
            lora_b_stacked (torch.Tensor): lora_b's weights.
            add_inputs (bool): Defaults to True.
        """
        lora_expand(
            x.unsqueeze(dim=0),
            (lora_b_stacked,),
            y,
            *self.token_mapping_meta.meta_args(x.size(0)),
            add_inputs=add_inputs,
        )
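
    # Design note: the embedding path only needs the expand half of the
    # kernel pair -- `x` is already the LoRA-A lookup produced by the
    # embedding layer, so a single one-slice lora_expand call suffices.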

    def add_lora_linear(
        self,
        y: torch.Tensor,
        x: torch.Tensor,
        lora_a_stacked: tuple[torch.Tensor, ...],
        lora_b_stacked: tuple[torch.Tensor, ...],
        scale: float,
        output_slices: tuple[int, ...],
        *,
        buffer: torch.Tensor | None = None,
        **kwargs,
    ) -> None:
        """
        Applicable to linear-related lora.

        Semantics:
            for i in range(len(lora_a_stacked)):
                y[i] += (
                    x[i].unsqueeze(0)
                    @ lora_a_stacked[indices[i], layer_idx, :, :]
                    @ lora_b_stacked[indices[i], layer_idx, :, :]
                    * scale
                    ).squeeze(0)

        Args:
            y (torch.Tensor): Output tensor. Will be changed in-place.
            x (torch.Tensor): Input tensor
            lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weight.
            lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight.
            scale (float): Scaling factor.
            output_slices (tuple[int, ...]): Every slice's size.
            buffer (Optional[torch.Tensor]): Defaults to None.
        """
        assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices)
        assert buffer is None, (
            "To minimize overhead, the buffer should be created by "
            ".add_lora_linear() instead of being passed in."
        )

        r = lora_b_stacked[0].size(-1)
        # The intermediate is accumulated in float32 for numerical stability,
        # regardless of the input/output dtypes.
        buffer = torch.empty(
            (len(output_slices), x.size(0), r),
            dtype=torch.float32,
            device=x.device,
        )
        self.add_shrink(buffer, x, lora_a_stacked, scale, **kwargs)
        self.add_expand(
            y, buffer, lora_b_stacked, output_slices, add_inputs=True, **kwargs
        )

    def add_lora_logits(
        self,
        y: torch.Tensor,
        x: torch.Tensor,
        lora_a_stacked: torch.Tensor,
        lora_b_stacked: torch.Tensor,
        scale,
        *,
        buffer: torch.Tensor | None = None,
        **kwargs,
    ) -> None:
        """
        Applies lora specifically for LogitsProcessorWithLoRA.

        Semantics:
            buffer = (x @ lora_a_stacked) * scale
            y += buffer @ lora_b_stacked

        Args:
            y (torch.Tensor): Output tensor.
            x (torch.Tensor): Input tensor.
            lora_a_stacked (torch.Tensor): lora_a's weights.
            lora_b_stacked (torch.Tensor): lora_b's weights.
            scale (float): Scaling factor.
            buffer (Optional[torch.Tensor]): Defaults to None.
        """
        y_org = y
        y = y.view(-1, y.shape[-1])
        x = x.view(-1, x.shape[-1])

        r = lora_b_stacked.size(-1)
        assert buffer is None, (
            "To minimize overhead, the buffer should be created by "
            ".add_lora_linear() instead of being passed in."
        )
        buffer = torch.empty((x.size(0), r), dtype=torch.float32, device=x.device)

        # The logits path is indexed per prompt (sampler indices), not per
        # token, hence prompt_mapping_meta rather than token_mapping_meta.
        lora_shrink(
            x,
            (lora_a_stacked,),
            buffer.unsqueeze(dim=0),
            *self.prompt_mapping_meta.meta_args(x.size(0)),
            scale,
        )
        lora_expand(
            buffer.unsqueeze(dim=0),
            (lora_b_stacked,),
            y,
            *self.prompt_mapping_meta.meta_args(buffer.size(0)),
            add_inputs=True,
        )
        y = y.view_as(y_org)

    def moe_lora_align_block_size(
        self,
        topk_ids: torch.Tensor,
        num_tokens: int,
        block_size: int,
        num_experts: int,
        max_loras: int,
        adapter_enabled: torch.Tensor,
        expert_map: torch.Tensor | None = None,
        pad_sorted_ids: bool = False,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Aligns tokens and experts into block-sized chunks for LoRA-based
        mixture-of-experts (MoE) execution.
        """
        # Worst case: every expert's token count is one short of a full block.
        max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
        if pad_sorted_ids:
            max_num_tokens_padded = round_up(max_num_tokens_padded, block_size)
        sorted_ids = torch.empty(
            (max_loras * max_num_tokens_padded,),
            dtype=torch.int32,
            device=topk_ids.device,
        )
        max_num_m_blocks = triton.cdiv(max_num_tokens_padded, block_size)
        expert_ids = torch.empty(
            (max_loras * max_num_m_blocks,),
            dtype=torch.int32,
            device=topk_ids.device,
        )
        num_tokens_post_pad = torch.empty(
            max_loras, dtype=torch.int32, device=topk_ids.device
        )

        (token_lora_mapping, _, _, _, _lora_ids, _) = (
            self.token_mapping_meta.meta_args(num_tokens)
        )
        ops.moe_lora_align_block_size(
            topk_ids,
            token_lora_mapping,
            num_experts,
            block_size,
            max_loras,
            max_num_tokens_padded,
            max_num_m_blocks,
            sorted_ids,
            expert_ids,
            num_tokens_post_pad,
            adapter_enabled,
        )
        if expert_map is not None:
            expert_ids = expert_map[expert_ids]

        return sorted_ids, expert_ids, num_tokens_post_pad
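
    # Shape summary for moe_lora_align_block_size, per the allocations above:
    #   sorted_ids          -- (max_loras * max_num_tokens_padded,)
    #   expert_ids          -- (max_loras * max_num_m_blocks,)
    #   num_tokens_post_pad -- (max_loras,), one padded count per LoRA slot.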

    def add_lora_fused_moe(
        self,
        y: torch.Tensor,
        x: torch.Tensor,
        lora_a_stacked: tuple[torch.Tensor, ...],
        lora_b_stacked: tuple[torch.Tensor, ...],
        topk_weights: torch.Tensor,
        sorted_token_ids: torch.Tensor,
        expert_ids: torch.Tensor,
        num_tokens_post_padded: torch.Tensor,
        max_lora_rank: int,
        top_k_num: int,
        shrink_config,
        expand_config,
        adapter_enabled: torch.Tensor,
        mul_routed_weight: bool = False,
        fully_sharded: bool = False,
        offset: int = 0,
    ):
        """
        Performs a fused forward computation for LoRA of Mixture-of-Experts
        (MoE) layer.
        """
        (token_lora_mapping, _, _, _, lora_ids, _) = (
            self.token_mapping_meta.meta_args(x.size(0))
        )
        # NOTE: the kernel-config defaults below were recovered from the
        # compiled constants where legible; verify them (and the argument
        # order) against the fused_moe_lora kernel of the installed vLLM.
        fused_moe_lora(
            y,
            x,
            lora_a_stacked,
            lora_b_stacked,
            topk_weights,
            sorted_token_ids,
            expert_ids,
            num_tokens_post_padded,
            max_lora_rank,
            top_k_num,
            token_lora_mapping,
            lora_ids,
            adapter_enabled,
            shrink_config.get("BLOCK_SIZE_M", 64),
            shrink_config.get("BLOCK_SIZE_N", 64),
            shrink_config.get("BLOCK_SIZE_K", 64),
            shrink_config.get("GROUP_SIZE_M", 8),
            shrink_config.get("NUM_WARPS", 4),
            shrink_config.get("NUM_STAGES", 3),
            shrink_config.get("SPLIT_K", 1),
            expand_config.get("BLOCK_SIZE_M", 64),
            expand_config.get("BLOCK_SIZE_N", 64),
            expand_config.get("BLOCK_SIZE_K", 64),
            expand_config.get("GROUP_SIZE_M", 8),
            expand_config.get("NUM_WARPS", 4),
            expand_config.get("NUM_STAGES", 3),
            expand_config.get("SPLIT_K", 1),
            mul_routed_weight,
            fully_sharded,
            offset,
        )
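

# Minimal end-to-end usage sketch (illustrative: assumes a CUDA device with
# Triton available and a LoRAConfig exposing `max_loras`; tensor construction
# and the LoRAMapping are elided):
#
#     wrapper = PunicaWrapperGPU(
#         max_num_batched_tokens=8192,
#         max_batches=256,
#         device="cuda",
#         lora_config=lora_config,
#     )
#     wrapper.update_metadata(mapping, lora_index_to_id, max_loras, vocab_size)
#     wrapper.add_lora_linear(y, x, lora_a_stacked, lora_b_stacked,
#                             scale=1.0, output_slices=(out_dim,))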