
    .`iv=                         d Z ddlmZmZ ddlmZ ddlZddlmZm	Z	 erddl
mZ  G d d	e          Z G d
 de          ZdS )z
Based on:
Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023).
Punica: Multi-Tenant LoRA Serving.
https://arxiv.org/abs/2310.18547
    )ABCabstractmethod)TYPE_CHECKINGN   )compute_metaconvert_mapping)LoRAMappingc                       e Zd ZdZedddeedz           dededdf
d	            Zed
ee	j
        df         e	j
        z  de	j
        dee	j
        df         dede	j
        dz  f
d            Ze	 	 dd
e	j
        dee	j
        df         e	j
        z  dee	j
        df         deedf         dede	j
        dz  fd            Ze	 dd
e	j
        de	j
        de	j
        dede	j
        dz  f
d            Zeddd
e	j
        de	j
        dee	j
        df         dee	j
        df         dedeedf         dee	j
        df         dz  de	j
        dz  fd            Zeddd
e	j
        de	j
        de	j
        de	j
        de	j
        dz  de	j
        dz  fd            ZdS )PunicaWrapperABCz
    PunicaWrapper ABC.
    mappingr	   lora_index_to_idN	max_loras
vocab_sizereturnc                     t           )z2
        Update the lora-related metadata
        NotImplementedErrorselfr   r   r   r   kwargss         x/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/lora/punica_wrapper/punica_base.pyupdate_metadataz PunicaWrapperABC.update_metadata   s
     "!    y.xlora_a_stackedscalec                     t           )z?
        Performs GEMM  for multiple slices of lora_a.
        r   r   r   r   r   r   r   s         r   
add_shrinkzPunicaWrapperABC.add_shrink)   
     "!r   r   Tlora_b_stackedoutput_slicesoffset_startc                     t           )z>
        Performs GEMM for multiple slices of lora_b.
        r   r   r   r   r"   r#   r$   
add_inputsr   s           r   
add_expandzPunicaWrapperABC.add_expand8   s
     "!r   r'   c                     t           )z
        Applies lora  specifically for VocabParallelEmbeddingWithLoRA,
        and this layer only requires the expand operation.
        r   r   r   r   r"   r'   r   s         r   add_lora_embeddingz#PunicaWrapperABC.add_lora_embeddingH   r!   r   bufferr-   c                    t           )z4
        Applicable to linear-related lora.
        r   	r   r   r   r   r"   r   r#   r-   r   s	            r   add_lora_linearz PunicaWrapperABC.add_lora_linearW   s
    " "!r   c                    t           )zI
        Applies lora  specifically for LogitsProcessorWithLoRA.
        r   r   r   r   r   r"   r   r-   r   s           r   add_lora_logitsz PunicaWrapperABC.add_lora_logitsj   s
     "!r   r   TT)__name__
__module____qualname____doc__r   listintr   tupletorchTensorfloatr    r(   boolr+   r0   r3    r   r   r   r      s%         "" sTz*" 	"
 " 
" " " ^" "s"#el2" <" elC/0	"
 " 
	" " " ^"  " "<" s"#el2" elC/0	"
 S#X" " 
	" " " ^"   " "<" <" 	"
 " 
	" " " ^"  37" " "<" <" elC/0	"
 elC/0" " S#X" elC'(4/" 
	" " " ^"$  '+" " "<" <" 	"
 " t#" 
	" " " ^" " "r   r   c                      e Zd ZdZdededej        ez  fdZddde	ed	z           d
edefdZ
dej        dd	fdZedeej        ej        ej        eeef         fd            Zedej        fd            Zedej        fd            Zedej        fd            Zedej        fd            Zddde	ed	z           d
edefdZedeej        df         ej        z  dej        deej        df         dedej        d	z  f
d            Ze	 	 d:dej        deej        df         ej        z  deej        df         deedf         d edej        d	z  fd!            Ze	 d;dej        dej        dej        d"edej        d	z  f
d#            Zed	d$dej        dej        deej        df         deej        df         dedeedf         d%eej        df         d	z  dej        d	z  fd&            Zed	d$dej        dej        dej        dej        d%ej        d	z  dej        d	z  fd'            Z	 	 d<d)ej        d*ed+ed,ed
ed-ej        d.ej        d	z  d/edeej        ej        ej        f         fd0Z	 	 	 d=dej        dej        deej        df         deej        df         d1ej        d2ej        d3ej        d4ej        d5ed6ed-ej        d7ed8efd9Zd	S )>PunicaWrapperBasez
    PunicaWrapperBase is designed to manage and provide metadata for the punica
    kernel. The main function is to maintain the state information for
    Multi-LoRA, and to provide the interface for the punica.
    max_num_batched_tokensmax_batchesdevicec                    t          j        |t           j        |          | _        t          j        |t           j        |          | _        t          j        |t           j        |          | _        t          j        d|t           j        |          | _        d gdz  | _        t          j        |t           j        |          | _        t          j        |t           j        |          | _	        t          j        |t           j        |          | _
        || _        d| _        d| _        d| _        d| _        d| _        d S )N)dtyperF         r   F)r=   emptylong_token_lora_indices_sampler_indices_sampler_indices_padded_embeddings_indicesindices_len_seq_start_locs_seq_lengths_lora_indices_per_batchrF   
max_length
token_nums
batch_size
is_prefillno_lora)r   rD   rE   rF   r   s        r   __init__zPunicaWrapperBase.__init__   s.    $);"%*V$
 $
 $
  !&"%*V!
 !
 !
 (-{"%*V(
 (
 (
$ $);%UZ$
 $
 $
  /3VaZ${;ejQWXXX!K5:fUUU',{uz&(
 (
 (
$ %+  !r   r   r	   r   Nr   r   c                    d}t          |||||| j                  \  }}}}	}
| j        d |j        d                                      |           | j        d |j        d                                      |           | j        d |j        d                                      |           | j        d |	j        d         d |	j        d         f                             |	           |
| j        d d <   d S )Nr   r   )	r   rF   rN   shapecopy_rO   rP   rQ   rR   )r   r   r   r   r   extra_vocab_sizebase_indicessampler_indicessampler_indices_paddedembeddings_indicesrR   s              r   _update_base_metadataz'PunicaWrapperBase._update_base_metadata   s#     K
 
	
" 	 !8<#5a#8!89??MMM8 5a 889??PPP$%F'='CA'F%FGMM"	
 	
 	
 	 ) &q))+H-?-Ea-H+HH	

%"
#
#
#)r   token_lora_tensorr   c                 z   t          |          \  }}}}}}}| j        d |j        d                                      |           | j        d |j        d                                      |           | j        d |j        d                                      |           || _        || _        || _        || _	        d S )Nr   )
r   rS   r]   r^   rT   rU   rX   rV   rW   rZ   )	r   re   b_seq_start_tensorseq_length_tensorlora_indices_tensorrX   rV   rW   rZ   s	            r   _update_prefill_metadataz*PunicaWrapperBase._update_prefill_metadata   s     *++	
 	:17::;AABTUUU6-3A667==>OPPP$%C':'@'C%CDJJ	
 	
 	
 %$$r   c                     | j         d| j                 | j        d| j                 | j        d| j                 | j        | j        | j        fS )aY  
        This property provides a convenient way to access the necessary
        metadata for prefill-related  kernel computations.
            1. seq_start_locs: Tensor of sequence start positions.
            2. seq_lengths: Tensor of sequence lengths.
            3. lora_indices_per_batch: Tensor of lora indices, and an index of
                -1 means no lora should be applied.
            4. batch_size: Batch size after clustering identical lora indices.
            5. max_length: The maximum sequence length in the batch.
            6. token_nums: The token numbers in the batch.
        N)rS   rX   rT   rU   rV   rW   )r   s    r   prefill_metadataz"PunicaWrapperBase.prefill_metadata   sV       !24?!23//0():4?):;OOO
 	
r   c                 :    | j         d         }| j        d|         S )z
        This property provides the lora indices corresponding to each token
        in the batch. An index of -1 means no lora should be applied.
        r   N)rR   rN   )r   token_lora_lens     r   token_lora_indicesz$PunicaWrapperBase.token_lora_indices   s#     )!,'88r   c                 :    | j         d         }| j        d|         S )zt
        This property is used to access the lora indices specifically for
        LogitsProcessorWithLoRA.
        r   N)rR   rO   )r   sampler_indices_lens     r   ra   z!PunicaWrapperBase.sampler_indices  s'     #.q1$%9&9%9::r   c                 :    | j         d         }| j        d|         S )zJ
        This property provides access to padded sampler indices.
        rI   N)rR   rP   )r   indices_padded_lens     r   rb   z(PunicaWrapperBase.sampler_indices_padded  s'    
 "-a0+,?-?,?@@r   c                 B    | j         d         }| j        ddd|f         S )z
        This property provides access to the indices used for lora embeddings,
        specifically for VocabParallelEmbeddingWithLoRA.
           N)rR   rQ   )r   embeddings_indices_lens     r   rc   z$PunicaWrapperBase.embeddings_indices  s0     "&!1!!4'+B,B+B(BCCr   c                     |                      ||||           |j        r#|                     | j                   d| _        d S d| _        d S )NTF)rd   rY   rj   ro   r   s         r   r   z!PunicaWrapperBase.update_metadata  sW     	""7,<iTTT 	$))$*ABBB"DOOO#DOOOr   r   .r   r   r   c                     t           )a  
        Performs GEMM  for multiple slices of lora_a.

        Semantics:
        for i in range(len(lora_a_stacked)):
            y[i] += (x @ lora_a_stacked[i]) * scale

        Args:
            y (Union[tuple[torch.Tensor, ...], torch.Tensor]): Output tensors
            x (torch.Tensor): Input tensor
            lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weights
            scale (float): Scaling factor for the operation

        r   r   s         r   r    zPunicaWrapperBase.add_shrink-  
    0 "!r   r   Tr"   r#   r$   c                     t           )a  
        Performs GEMM for multiple slices of lora_b.

        Semantics:
            offset = offset_start
            for i in range(len(lora_b_stacked)):
                slice = output_slices[i]
                y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i]
                offset += slice

        Args:
            y (torch.Tensor): Output tensor.
            x (Union[tuple[torch.Tensor, ...], torch.Tensor]): Input tensors
            lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight
            output_slices (tuple[int, ...]): Every slice's size
            offset_start (int): The starting position of y, defaults to 0
            add_inputs (bool):  Defaults to True.

        r   r&   s           r   r(   zPunicaWrapperBase.add_expandG  s
    > "!r   r'   c                     t           )a  
        Applies lora  specifically for VocabParallelEmbeddingWithLoRA.
        and this layer only requires the expand operation.
        Semantics:
            y += x @ lora_b_stacked

        Args:
            y (torch.Tensor): Output tensor.
            x (torch.Tensor): Input tensor.
            lora_b_stacked (torch.Tensor): lora_b's weights.
            add_inputs (bool): Default to True.
        r   r*   s         r   r+   z$PunicaWrapperBase.add_lora_embeddingh  s
    , "!r   r,   r-   c                    t           )aB  
        Applicable to linear-related lora.

        Semantics:
            for i in range(len(lora_a_stacked)):
                y[i] += (
                    x[i].unsqueeze(0)
                    @ lora_a_stacked[indices[i], layer_idx, :, :]
                    @ lora_b_stacked[indices[i], layer_idx, :, :]
                    * scale
                    ).squeeze(0)

        Args:
            y (torch.Tensor): Output tensor. Will be changed in-place.
            x (torch.Tensor): Input tensor
            lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weight.
            lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight.
            scale (float): Scaling factor.
            output_slices (tuple[int, ...]): Every slice's size.
            buffer (Optional[tuple[torch.Tensor, ...]]): Defaults to None.
        r   r/   s	            r   r0   z!PunicaWrapperBase.add_lora_linear  s    F "!r   c                    t           )a  
        Applies lora  specifically for LogitsProcessorWithLoRA.

        Semantics:
            buffer = (x @ lora_a_stacked) * scale
            y += buffer @ lora_b_stacked

        Args:
            y (torch.Tensor): Output tensor.
            x (torch.Tensor): Input tensor.
            lora_a_stacked (torch.Tensor): lora_a's weights.
            lora_b_stacked (torch.Tensor):lora_b's weights.
            scale (float): Scaling factor.
            buffer (Optional[torch.Tensor]):Default to None.
        r   r2   s           r   r3   z!PunicaWrapperBase.add_lora_logits  s
    8 "!r   Ftopk_ids
num_tokens
block_sizenum_expertsadapter_enabled
expert_mappad_sorted_idsc	                     t           )z~
        Aligns tokens and experts into block-sized chunks for LoRA-based
        mixture-of-experts (MoE) execution.
        r   )	r   r~   r   r   r   r   r   r   r   s	            r   moe_lora_align_block_sizez+PunicaWrapperBase.moe_lora_align_block_size  s
      "!r   topk_weightssorted_token_ids
expert_idsnum_tokens_post_paddedmax_lora_rank	top_k_numfully_shardedoffsetc                     t           )zj
        Performs a fused forward computation for LoRA of
        Mixture-of-Experts (MoE) layer.
        r   )r   r   r   r   r"   r   r   r   r   r   r   shrink_configexpand_configr   mul_routed_weightr   r   s                    r   add_lora_fused_moez$PunicaWrapperBase.add_lora_fused_moe  ry   r   r4   r5   )NF)FFr   )r6   r7   r8   r9   r;   r=   rF   strr[   r:   rd   r>   rj   propertyr<   rl   ro   ra   rb   rc   r   r   r?   r    r(   r@   r+   r0   r3   r   r   rA   r   r   rC   rC   |   s        # ## # s"	# # # #J"*"* sTz*"* 	"*
 "* "* "* "*H%, 4    * 
	u|U\5<c3F	G
 
 
 X
. 9EL 9 9 9 X9 ; ; ; ; X; A A A A XA DEL D D D XD$$ sTz*$ 	$
 $ $ $ $" "s"#el2" <" elC/0	"
 " 
	" " " ^"2  " "<" s"#el2" elC/0	"
 S#X" " 
	" " " ^"@   " "<" <" 	"
 " 
	" " " ^".  37"" "" ""<"" <"" elC/0	""
 elC/0"" "" S#X"" elC'(4/"" 
	"" "" "" ^""H  '+" " "<" <" 	"
 " t#" 
	" " " ^"J +/$" "," " 	"
 " " " L4'" " 
u|U\5<7	8" " " "B  ##" "<" <" elC/0	"
 elC/0" l"  ," L" !&" " " "  !"" #" " " " " "r   rC   )r9   abcr   r   typingr   r=   utilsr   r   vllm.lora.layersr	   r   rC   rA   r   r   <module>r      s     $ # # # # # # #              0 0 0 0 0 0 0 0 -,,,,,,c" c" c" c" c"s c" c" c"Lq" q" q" q" q"( q" q" q" q" q"r   