
    .`iA*                     b    d dl mZ d dlZd dlmZmZmZmZmZm	Z	 ddl
mZ  G d de          ZdS )    )CallableN)bgmv_expandbgmv_expand_slicebgmv_shrinksgmv_expandsgmv_expand_slicesgmv_shrink   )PunicaWrapperBasec                   .   e Zd ZdZdededej        ez  fdZdej	        dej	        dej	        d	e
fd
Zdej	        dej	        dej	        d	e
fdZdej	        dej	        dej	        defdZdej	        dej	        dej	        defdZdej	        dej	        dej	        dededefdZdej	        dej	        dej	        dededefdZ	 d%dej	        dej	        dej	        dededefdZdej	        dej	        dej	        d	e
fdZdeej	        df         ej	        z  dej	        deej	        df         d	e
fdZ	 	 d&dej	        deej	        df         ej	        z  deej	        df         deedf         deddfdZ	 d%dej	        dej	        dej	        deddf
d Zdd!dej	        dej	        deej	        df         deej	        df         d	e
deedf         d"eej	        df         dz  ddfd#Zdd!dej	        dej	        dej	        dej	        d"ej	        dz  ddfd$ZdS )'PunicaWrapperCPUz
    PunicaWrapperCPU is designed to manage and provide metadata for the punica
    kernel. The main function is to maintain the state information for
    Multi-LoRA, and to provide the interface for the pytorch punica ops.
    max_num_batched_tokensmax_batchesdevicec                 4    t          j        | |||           d S N)r   __init__)selfr   r   r   kwargss        w/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/lora/punica_wrapper/punica_cpu.pyr   zPunicaWrapperCPU.__init__   s"     	"4)?fUUUUU    yxw_t_allscalec                 D    | j         rd S t          |||g| j        |R   d S r   )no_lorar	   prefill_metadatar   r   r   r   r   s        r   _shrink_prefillz PunicaWrapperCPU._shrink_prefill&   sT     < 	F	
 "		

 	
 	
 	
 	
 	
 	
r   c                 6    t          |||| j        |           d S r   )r   token_lora_indicesr   s        r   _shrink_decodezPunicaWrapperCPU._shrink_decode8   s#     	Aw4#:EBBBBBr   
add_inputsc                 D    | j         rd S t          |||g| j        |R   d S r   )r   r   r   r   r   r   r   r$   s        r   _expand_prefillz PunicaWrapperCPU._expand_prefillA   sT     < 	F	
 "		

 	
 	
 	
 	
 	
 	
r   c                 6    t          |||| j        |           d S r   )r   r"   r&   s        r   _expand_decodezPunicaWrapperCPU._expand_decodeS   s#     	Aw4#:JGGGGGr   y_offsety_slice_sizec                 L    | j         rd S t          |||g| j        |||R   d S r   )r   r   r   r   r   r   r   r*   r+   r$   s          r   _expand_slice_prefillz&PunicaWrapperCPU._expand_slice_prefill\   sh     < 	F	
 "		

 	
 	
 	
 	
 	
 	
 	
 	
r   c           	      :    t          |||| j        |||           d S r   )r   r"   r-   s          r   _expand_slice_decodez%PunicaWrapperCPU._expand_slice_decoder   s2     	w42HlJ	
 	
 	
 	
 	
r   Tc                 P    | j         r| j        n| j        } |||||||           dS )z
        Perform the ` y[:,y_offset:y_offset+y_slice_size]+=x@w_t_all`
        computation, which is suitable for the
        GEMM of lora'b.
        N)
is_prefillr.   r0   )r   r   r   r   r*   r+   r$   expand_slice_funs           r   _apply_expandzPunicaWrapperCPU._apply_expand   sA      +//XD&&t?X 	 	Aw,
KKKKKr   c                     |}|                     d|j        d                   }| j        r| j        n| j        } |||||           |                    |          }dS )af  
        Perform the ` y+=x@w_t_all` computation, which is suitable for the
        GEMM of lora'a.
        When `is_prefill is` true, it indicates that it is currently the
        prefill stage, and the `_shrink_prefill` function should be called.
        Otherwise, it is the decode stage, and the _shrink_decode function
        should be called.
        N)viewshaper2   r    r#   view_as)r   r   r   r   r   y_org
shrink_funs          r   _apply_shrinkzPunicaWrapperCPU._apply_shrink   si     FF2qwr{##$(OLD  9L 	 	
1a%(((IIer   .lora_a_stackedc                     |                     d|j        d                   }t          t          |                    D ]&}|                     ||         |||         |           'dS )a  
        Performs GEMM  for multiple slices of lora_a.
        When `is_prefill is` true, it indicates that it is currently the
        prefill stage, and the `_shrink_prefill` function should be called.
        Otherwise, it is the decode stage, and the _shrink_decode function
        should be called.

        Semantics:
        for i in range(len(lora_a_stacked)):
            y[i] += (x @ lora_a_stacked[i]) * scale

        Args:
            y (Union[tuple[torch.Tensor, ...], torch.Tensor]): Output tensors
            x (torch.Tensor): Input tensor
            lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weights
            scale (float): Scaling factor for the operation
        r6   N)r7   r8   rangelenr<   )r   r   r   r=   r   r   	slice_idxs          r   
add_shrinkzPunicaWrapperCPU.add_shrink   sp    4 FF2qwr{##s>2233 	R 	RIq|Qy0I5QQQQ	R 	Rr   r   lora_b_stackedoutput_slicesoffset_startreturnNc           	      (   |}|                     d|j        d                   }|}	t          t          |                    D ]:}
|                     |||
         ||
         |	||
         |           |	||
         z  }	;|                    |          }dS )aT  
        Performs GEMM for multiple slices of lora_b.

        Semantics:
            for i in range(len(lora_b_stacked)):
                slice = output_slices[i]
                y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i]
                offset += slice

        Args:
            y (torch.Tensor): Output tensor.
            x (Union[tuple[torch.Tensor, ...], torch.Tensor]): Input tensors
            lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight
            output_slices (tuple[int, ...]): Every slice's size
            add_inputs (bool):  Defaults to True.
        r6   r$   N)r7   r8   r?   r@   r4   r9   )r   r   r   rC   rD   rE   r$   r   r:   offset_leftrA   s              r   
add_expandzPunicaWrapperCPU.add_expand   s    4 FF2qwr{##"s>2233 		4 		4I)y)i(%     =33KKIIer   c                 L    | j         r| j        n| j        } |||||           dS )a]  
        Applies lora  specifically for VocabParallelEmbeddingWithLoRA.

        Semantics:
            y += x @ lora_b_stacked

        Args:
            y (torch.Tensor): Output tensor.
            x (torch.Tensor): Input tensor.
            lora_b_stacked (torch.Tensor): lora_b's weights.
            add_inputs (bool): Default to True.
        N)r2   r'   r)   )r   r   r   rC   r$   r   
expand_funs          r   add_lora_embeddingz#PunicaWrapperCPU.add_lora_embedding   s<    . %)OLD  9L 	 	
1a44444r   )bufferrN   c                h  	 t          |          t          |          cxk    rt          |          k    sn J |Q|d                             d          	t          	fdt          t          |                    D                       } | j        |||fi |  | j        ||||fddi| dS )aB  
        Applicable to linear-related lora.

        Semantics:
            for i in range(len(lora_a_stacked)):
                y[i] += (
                    x[i].unsqueeze(0)
                    @ lora_a_stacked[indices[i], layer_idx, :, :]
                    @ lora_b_stacked[indices[i], layer_idx, :, :]
                    * scale
                    ).squeeze(0)

        Args:
            y (torch.Tensor): Output tensor. Will be changed in-place.
            x (torch.Tensor): Input tensor
            lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weight.
            lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight.
            scale (float): Scaling factor.
            output_slices (tuple[int, ...]): Every slice's size.
            buffer (Optional[tuple[torch.Tensor, ...]]): Defaults to None.
        Nr   r6   c              3      K   | ]?}t          j                            d           ft           j        j                  V  @dS )r   dtyper   N)torchzerossizefloat32r   ).0_rr   s     r   	<genexpr>z3PunicaWrapperCPU.add_lora_linear.<locals>.<genexpr>1  sZ         QVVAYYN%-QQQ     r   r$   T)r@   rU   tupler?   rB   rJ   )
r   r   r   r=   rC   r   rD   rN   r   rY   s
     `      @r   add_lora_linearz PunicaWrapperCPU.add_lora_linear	  s   D >""c.&9&9OOOOS=O=OOOOOOO>q!&&r**A      s=1122    F 	>5CCFCCCv~}	
 	
AE	
IO	
 	
 	
 	
 	
r   c                   |}|                     d|j        d                   }|                     d|j        d                   }|                    d          }	|;t          j        |                    d          |	ft          j        |j                  }t          |||| j        |           t          |||| j        d           |
                    |          }dS )a  
        Applies lora  specifically for LogitsProcessorWithLoRA.

        Semantics:
            buffer = (x @ lora_a_stacked) * scale
            y += buffer @ lora_b_stacked

        Args:
            y (torch.Tensor): Output tensor.
            x (torch.Tensor): Input tensor.
            lora_a_stacked (torch.Tensor): lora_a's weights.
            lora_b_stacked (torch.Tensor):lora_b's weights.
            scale (float): Scaling factor.
            buffer (Optional[torch.Tensor]):Default to None.
        r6   Nr   rQ   TrH   )r7   r8   rU   rS   rT   rV   r   r   sampler_indicesr   r9   )
r   r   r   r=   rC   r   rN   r   r:   rY   s
             r   add_lora_logitsz PunicaWrapperCPU.add_lora_logits:  s    4 FF2qwr{##FF2qwr{####> [!&&))Qu}QXVVVFA~vt/CUKKKFNAt/CPTUUUUIIer   )T)r   T)__name__
__module____qualname____doc__intrS   r   strr   Tensorfloatr    r#   boolr'   r)   r.   r0   r4   r<   r[   rB   rJ   rM   r\   r_    r   r   r   r      s        V #V V s"	V V V V
<
 <
 	

 
 
 
 
$C<C <C 	C
 C C C C
<
 <
 	

 
 
 
 
$H<H <H 	H
 H H H H
<
 <
 	

 
 
 
 
 
 
,
<
 <
 	

 
 
 
 
 
 
(  L L<L <L 	L
 L L L L L L("',9>NS   &Rs"#el2R <R elC/0	R
 R R R RJ ' '<' s"#el2' elC/0	'
 S#X' ' 
' ' ' '\  5 5<5 <5 	5
 5 
5 5 5 5H 37/
 /
 /
</
 </
 elC/0	/

 elC/0/
 /
 S#X/
 elC'(4//
 
/
 /
 /
 /
r '+% % %<% <% 	%
 % t#% 
% % % % % %r   r   )collections.abcr   rS   vllm.lora.ops.torch_opsr   r   r   r   r   r	   punica_baser   r   ri   r   r   <module>rm      s    % $ $ $ $ $                 + * * * * *
I I I I I( I I I I Ir   