
    *`iH                        d dl Z d dlmZmZmZmZmZmZ d dlm	Z	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZ g dZdZ G d de          Zdedee         deeef         deee         eeef         f         fdZdedefdZ dedeeegee         f         defdZ!dedeeegee         f         defdZ"dS )    N)AnyCallableDictListOptionalTuple)ReferenceTyperef)forward_quantize)getattr_chain)InternalModule)Tensor)Module)RemovableHandle)CachePretrainedConfigPreTrainedModel)QuantizedKVCacheinitialize_hooked_kv_cacheregister_key_hookregister_value_hookKV_CACHE_ATTRkv_cachec                        e Zd ZdZdedef fdZdeeef         fdZ	dededeeef         fd	Z
d
ee         fdZ xZS )r   ai  
    QuantizedKVCache module which wraps the functionality of any existing kvcache args.
    Unlike transform Cache instances, this cache is a `torch.nn.Module` which can be
    hooked to trigger transforms and calibration hooks.

    This module works by being registered as a submodule to attention modules via
    `initialize_hooked_kv_cache`, then adding a hook which replaces `past_key_values`
    kwargs with this module. This module adopts the functionality of the replaced cache,
    preserving caching functionality such as sliding window attention, ect.

    :param attn_module: parent attention module
    configattn_modulec                     t                                                       || _        t          |          | _        d | _        d S N)super__init__r   r
   r   past_key_values)selfr   r   	__class__s      w/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/compressed_tensors/modeling/kvcache.pyr    zQuantizedKVCache.__init__6   s>    {++?C    returnc                      | |i |S r    )r"   argskwargss      r$   updatezQuantizedKVCache.update<   s    tT$V$$$r%   
key_statesvalue_statesc                 4   |                                  }d}t          ||d           }t          |dd          }|&|r$t          ||d|          }t          ||d|          }| j        % |                                 j        ||g|R i |}	n||f}	d | _        |	S )Nz%quantization_scheme.input_activationsquantization_enabledTkv)r   r   getattrr   r!   r+   )
r"   r,   r-   r)   r*   modulequant_args_attr
quant_argsquant_enabledrets
             r$   forwardzQuantizedKVCache.forward?   s     !!##A"6?DAA
(>EE!m!)&*c:NNJ+FL#zRRL +/$&&((/L+/  39 CC |,C#
r%   r!   c                 D    |t          |          | _        d S d | _        d S r   )r
   r!   )r"   r!   s     r$   add_past_key_valuesz$QuantizedKVCache.add_past_key_valuesZ   s,    &#&#7#7D   #'D   r%   )__name__
__module____qualname____doc__r   r   r    r   r   r+   r8   r   r   r:   __classcell__)r#   s   @r$   r   r   (   s         D/ Df D D D D D D%vv~)> % % % %  
vv~	   6(8E? ( ( ( ( ( ( ( (r%   r   r3   r)   r*   r&   c                     dt          j        | j                  j        v rdnd}|                    |d          }t          | t                    }|                    |           |||<   ||fS )aZ  
    Hook which should be called before each quantized attention forward pass.
    This hook dynamically replaces the `past_key_values` kwarg to the attention
    forward function.

    The original kvcache object is assigned to QuantizedKVCache().past_key_values
    as a weakref to maintain original cache functionality and compute savings
    r!   past_key_valueN)inspect	signaturer8   
parametersgetr2   r   r:   )r3   r)   r*   _past_kv_namer!   caches         r$   _kv_cache_attention_hookrH   d   s{      1&. A A LLL 	 
 (.zz-'F'FO%fm<<E	o...!F=<r%   modelc                     t          |t                    sL|                    t          t          | j        |                     |                    t          d           dS dS )z
    Initialize a `QuantizedKVCache` instance attached to attention

    :param model: parent model of attention module
    :param module: attention module to initialize with
    Twith_kwargsN)hasattrr   register_moduler   r   register_forward_pre_hookrH   )rI   r3   s     r$   r   r   }   sf     6=)) U}.>u|V.T.TUUU(()At(TTTTTU Ur%   hookc                 z     t           t                    }dt          f fd}|                    |d          S )z
    Register a hook which takes post-rope key states as an argument and
    returns the modified key states or `None`

    :param module: attention module to add hook to
    :param hook: key hook function
    rG   c                      t          j        | j                  j        |i |} |j        d                   }|
||j        d<   |j        |j        fS )Nr,   rB   rC   r8   bind	argumentsr)   r*   rG   r)   r*   boundvaluerP   r3   s        r$   _hookz register_key_hook.<locals>._hook   s`    5!%-005tFvFFVU_\:;;,1EOL)z5<''r%   TrK   r2   r   r   rO   r3   rP   r   rY   s   ``  r$   r   r      [     ")!?!?H(% ( ( ( ( ( ( ( --e-FFFr%   c                 z     t           t                    }dt          f fd}|                    |d          S )z
    Register a hook which takes value states as an argument and
    returns the modified value states or `None`

    :param module: attention module to add hook to
    :param hook: value hook function
    rG   c                      t          j        | j                  j        |i |} |j        d                   }|
||j        d<   |j        |j        fS )Nr-   rS   rV   s        r$   rY   z"register_value_hook.<locals>._hook   s`    5!%-005tFvFFVU_^<==.3EON+z5<''r%   TrK   rZ   r[   s   ``  r$   r   r      r\   r%   )#rB   typingr   r   r   r   r   r   weakrefr	   r
   1compressed_tensors.quantization.lifecycle.forwardr   compressed_tensors.utilsr   !compressed_tensors.utils.internalr   torchr   torch.nnr   torch.utils.hooksr   transformersr   r   r   __all__r   r   strrH   r   r   r   r(   r%   r$   <module>rj      s=    = = = = = = = = = = = = = = = = & & & & & & & & N N N N N N 2 2 2 2 2 2 < < < < < <             - - - - - - A A A A A A A A A A   6( 6( 6( 6( 6(~ 6( 6( 6(xs)-1#s(^
49d38n$%   2	Uo 	Uv 	U 	U 	U 	UGG"FF#3Xf5E#EFGG G G G.GG"FF#3Xf5E#EFGG G G G G Gr%   