
    .`iB"                     v    d Z ddlmZ ddlZddlmZ ddlmZmZm	Z	 ddl
mZ e G d d	e                      ZdS )
z
Based on:
Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023).
Punica: Multi-Tenant LoRA Serving.
https://arxiv.org/abs/2310.18547
    )finalN)LoRAMapping)bgmv_expandbgmv_expand_slicebgmv_shrink   )PunicaWrapperBasec                      e Zd ZdZdededej        ez  fdZde	de
edz           d	ed
efdZdej        dej        fdZdej        dej        dej        defdZdej        dej        dej        dededefdZdej        dej        deej        df         defdZ	 	 d&dej        dej        deej        df         deedf         deddfdZ	 d'dej        dej        dej        deddf
d Zdd!dej        dej        deej        df         deej        df         dedeedf         d"ej        dz  ddfd#Zedej        fd$            Zdd!dej        dej        dej        dej        d"ej        dz  ddfd%ZdS )(PunicaWrapperXPUz
    PunicaWrapperXPU is designed to manage and provide metadata for the punica
    kernel. The main function is to maintain the state information for
    Multi-LoRA, and to provide the interface for the punica ipex kernel.
    max_num_batched_tokensmax_batchesdevicec                    t          j        | |||           t          j                            | j        d           t          j                            | j        d           t          j                            | j        d           d S )Nr   r   )r	   __init__torch_dynamomark_dynamic_token_lora_indices_embeddings_indices_sampler_indices_padded)selfr   r   r   kwargss        w/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/lora/punica_wrapper/punica_xpu.pyr   zPunicaWrapperXPU.__init__   ss     	"4)?fUUU""4#;Q???""4#;Q???""4#?CCCCC    mappinglora_index_to_idN	max_loras
vocab_sizec                 N    |j         | _         |                     ||||           d S N)
is_prefill_update_base_metadata)r   r   r   r   r   r   s         r   update_metadataz PunicaWrapperXPU.update_metadata(   s/     ",""7,<iTTTTTr   xreturnc                 `    t          j        | j        dd|                    d                    S )Nr   )r   narrowr   size)r   r$   s     r   _get_token_lora_indicesz(PunicaWrapperXPU._get_token_lora_indices3   s%    |D4aAFF1IIFFFr   yw_t_allscalec                 R    t          ||||                     |          |           d S r    )r   r)   )r   r*   r$   r+   r,   s        r   _apply_shrinkzPunicaWrapperXPU._apply_shrink6   s.     	Aw4#?#?#B#BEJJJJJr   y_offsety_slice_size
add_inputsc           	      Z    |                      |          }t          |||||||           d S r    )r)   r   )r   r*   r$   r+   r/   r0   r1   token_lora_indicess           r   _apply_expandzPunicaWrapperXPU._apply_expand?   sD     "99!<<w-xz	
 	
 	
 	
 	
r   lora_a_stacked.c                     |                     d|j        d                   }t          t          |                    D ]&}|                     ||         |||         |           'dS )a  
        Performs GEMM  for multiple slices of lora_a.

        Semantics:
        for i in range(len(lora_a_stacked)):
            y[i] += (x @ lora_a_stacked[i]) * scale

        Args:
            y (torch.Tensor): Output tensors
            x (torch.Tensor): Input tensor
            lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weights
            scale (float): Scaling factor for the operation
        N)viewshaperangelenr.   )r   r*   r$   r5   r,   r   	slice_idxs          r   
add_shrinkzPunicaWrapperXPU.add_shrinkM   sp    , FF2qwr{##s>2233 	R 	RIq|Qy0I5QQQQ	R 	Rr   r   Tlora_b_stackedoutput_slicesoffset_startc           	         |}|                     d|j        d                   }|j        dk    sJ |                    d          t	          |          k    sJ t          t	          |                    D ]:}	|                     |||	         ||	         |||	         |           |||	         z  };|                    |           dS )a2  
        Performs GEMM for multiple slices of lora_b.

        Semantics:
            for i in range(len(lora_b_stacked)):
                slice = output_slices[i]
                y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i]
                offset += slice

        Args:
            y (torch.Tensor): Output tensor.
            x (torch.Tensor): Input tensors
            lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight
            output_slices (tuple[int, ...]): Every slice's size
            add_inputs (bool): Defaults to True.
        r7      r   r1   N)r8   r9   ndimr(   r;   r:   r4   view_as)
r   r*   r$   r>   r?   r@   r1   r   y_orgr<   s
             r   
add_expandzPunicaWrapperXPU.add_expandg   s    4 FF2qwr{##v{{{{vvayyC...... s>2233 		5 		5I)y)i(%     M)44LL			%r   c                 V    |                      |          }t          |||||           dS )a]  
        Applies lora  specifically for VocabParallelEmbeddingWithLoRA.

        Semantics:
            y += x @ lora_b_stacked

        Args:
            y (torch.Tensor): Output tensor.
            x (torch.Tensor): Input tensor.
            lora_b_stacked (torch.Tensor): lora_b's weights.
            add_inputs (bool): Default to True.
        N)r)   r   )r   r*   r$   r>   r1   r   r3   s          r   add_lora_embeddingz#PunicaWrapperXPU.add_lora_embedding   s5    ( "99!<<A~q*<jIIIIIr   )bufferrJ   c                   t          |          t          |          cxk    rt          |          k    sn J |d|d                             d          }	t          j        t          |          |                    d          |	ft          j        |j                  } | j        ||||fi |  | j        ||||fddi| dS )a6  
        Applicable to linear-related lora.

        Semantics:
            for i in range(len(lora_a_stacked)):
                y[i] += (
                    x[i].unsqueeze(0)
                    @ lora_a_stacked[indices[i], layer_idx, :, :]
                    @ lora_b_stacked[indices[i], layer_idx, :, :]
                    * scale
                    ).squeeze(0)

        Args:
            y (torch.Tensor): Output tensor. Will be changed in-place.
            x (torch.Tensor): Input tensor
            lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weight.
            lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight.
            scale (float): Scaling factor.
            output_slices (tuple[int, ...]): Every slice's size.
            buffer (Optional[torch.Tensor]): Defaults to None.
        Nr   r7   dtyper   r1   T)r;   r(   r   zerosfloat32r   r=   rG   )
r   r*   r$   r5   r>   r,   r?   rJ   r   rs
             r   add_lora_linearz PunicaWrapperXPU.add_lora_linear   s   D >""c.&9&9OOOOS=O=OOOOOOO>q!&&r**A []##QVVAYY2mx  F
 			
 	

 	
 	
 	
 			
 	

 	
 	
 	
 	
 	
 	
r   c                      | j         dd         S )zJ
        This property provides access to padded sampler indices.
        N)r   )r   s    r   sampler_indices_paddedz'PunicaWrapperXPU.sampler_indices_padded   s    
 +AAA..r   c                   |}|                     d|j        d                   }|                     d|j        d                   }|                    d          }	|;t          j        |                    d          |	ft          j        |j                  }t          j        | j        dd|                    d                    }
t          ||||
|           t          ||||
d           |                    |          S )a  
        Applies lora  specifically for LogitsProcessorWithLoRA.

        Semantics:
            buffer = (x @ lora_a_stacked) * scale
            y += buffer @ lora_b_stacked

        Args:
            y (torch.Tensor): Output tensor.
            x (torch.Tensor): Input tensor.
            lora_a_stacked (torch.Tensor): lora_a's weights.
            lora_b_stacked (torch.Tensor): lora_b's weights.
            scale (float): Scaling factor.
            buffer (Optional[torch.Tensor]): Default to None.
        r7   Nr   rL   TrC   )r8   r9   r(   r   rN   rO   r   r'   _sampler_indicesr   r   rE   )r   r*   r$   r5   r>   r,   rJ   r   rF   rP   sampler_indicess              r   add_lora_logitsz PunicaWrapperXPU.add_lora_logits   s    4 FF2qwr{##FF2qwr{####> [!&&))Qu}QXVVVF,t'<aAFF1IINNA~vFFFFNA4PPPPyyr   )r   T)T)__name__
__module____qualname____doc__intr   r   strr   r   listr#   Tensor	IntTensorr)   floatr.   boolr4   tupler=   rG   rI   rQ   propertyrS   rW    r   r   r   r      s        
D #
D 
D s"	
D 
D 
D 
D	U	U sTz*	U 		U
 	U 	U 	U 	UG G%/ G G G GK<K <K 	K
 K K K K
<
 <
 	

 
 
 
 
 
 
R<R <R elC/0	R
 R R R R@ + +<+ <+ elC/0	+
 S#X+ + 
+ + + +d  J J<J <J 	J
 J 
J J J J@ '+;
 ;
 ;
<;
 <;
 elC/0	;

 elC/0;
 ;
 S#X;
 t#;
 
;
 ;
 ;
 ;
z / / / / X/ '+%  %  % <%  <%  	% 
 %  t#%  
%  %  %  %  %  % r   r   )r[   typingr   r   vllm.lora.layersr   vllm.lora.ops.ipex_opsr   r   r   punica_baser	   r   re   r   r   <module>rj      s            ( ( ( ( ( ( N N N N N N N N N N * * * * * *          (          r   