
    *`iV                        d dl Z d dlmZmZ d dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZ d dlmZ g dZdZdZ G d de          ZdefdZdedefdZdedeeegee         f         defdZdS )    N)CallableOptional)initialize_hooked_kv_cache)forward_quantize)getattr_chain)InternalModule)Tensor)Module)RemovableHandle)PretrainedConfigPreTrainedModel)ALL_MASK_ATTENTION_FUNCTIONS)ALL_ATTENTION_FUNCTIONS)QuantizedAttentionImplinitialize_hooked_attentionregister_query_hook	IMPL_ATTRimplct_hooked_attentionc                   D     e Zd ZdZdZdef fdZdedededefd	Z	 xZ
S )
r   a  
    QuantizedAttentionImpl module which wraps the functionality of the original
    attention implementation. Unlike the original attention function, this
    implementation is a `torch.nn.Module` which can be hooked to trigger
    transforms and calibration hooks.

    This module works by being registered as a submodule to attention modules via
    `initialize_hooked_attention`, registering a new attention implementation function
    which calls this module, then setting the model attention implementation to the new
    function. After triggering hooks and quantization, this module calls the original
    attention implementation function.
    eagerconfigc                 V    t                                                       || _        d S )N)super__init__r   )selfr   	__class__s     y/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/compressed_tensors/modeling/attention.pyr   zQuantizedAttentionImpl.__init__:   s$        modulequerykeyvaluec                     d}t          ||d           }t          |dd          }	||	rt          ||d|          }t          t          j                 ||||g|R i |S )Nz%quantization_scheme.input_activationsquantization_enabledTq)r   getattrr   r   r   _original_impl)
r   r    r!   r"   r#   argskwargsquant_args_attr
quant_argsquant_enableds
             r   forwardzQuantizedAttentionImpl.forward>   s     B"6?DAA
(>EE!m!$VUCDDE ''='LM	

 
 
 
 
 
 	
r   )__name__
__module____qualname____doc__r(   r   r   r
   r	   r.   __classcell__)r   s   @r   r   r   *   s          N/      

 
 	

 
 
 
 
 
 
 
 
r   r   r    c                     t          | t                    sJ dt           dt           d             t          | t                    | g|R i |S )NzUsing z> attention implementation, but attention module does not have z submodule.)hasattrr   HOOKED_ATTENTION_NAMEr'   )r    r)   r*   s      r   _hooked_attentionr7   \   s|    69%%  	E& 	E 	E.7	E 	E 	E %
 &769%%f>t>>>v>>>r   modelc                    t          |t                    s-|                    t          t          | j                             | j        j        t          k    r| j        j        t          _        t          | j        j                 }t          j
        t          t                     t          j
        t          |           |                     t                     | j        j        t          k    sJ t          | |           dS )a  
    Initialize `QuantizedAttentionImpl` and `QuantizedKVCache` instances
    attached to attention. Assumes that only one model is hooked at a time.

    :param model: parent model of attention module
    :param module: attention module to initialize with
    N)r5   r   register_moduler   r   _attn_implementationr6   r(   r   r   registerr7   set_attn_implementationr   )r8   r    original_masks      r   r   r   e   s     69%% Py*@*N*NOOO|(,AAA050Q-4U\5VW()>@QRRR$-.C]SSS%%&;<<<|04IIIIIuf-----r   hookreturnc                 z     t           t                    }dt          f fd}|                    |d          S )z
    Register a hook which takes post-rope query states as an argument and
    returns the modified query states or `None`

    :param module: attention module to add hook to
    :param hook: query hook function
    r   c                      t          j        | j                  j        |i |} |j        d                   }|
||j        d<   |j        |j        fS )Nr!   )inspect	signaturer.   bind	argumentsr)   r*   )r   r)   r*   boundr#   r?   r    s        r   _hookz"register_query_hook.<locals>._hook   s`    4!$,//4dEfEEVU_W566',EOG$z5<''r   T)with_kwargs)r'   r   r   register_forward_pre_hook)r    r?   r   rH   s   ``  r   r   r      s[     $+69#=#=D(* ( ( ( ( ( ( ( ))%T)BBBr   ) rC   typingr   r   #compressed_tensors.modeling.kvcacher   1compressed_tensors.quantization.lifecycle.forwardr   compressed_tensors.utilsr   !compressed_tensors.utils.internalr   torchr	   torch.nnr
   torch.utils.hooksr   transformersr   r   transformers.masking_utilsr   transformers.modeling_utilsr   __all__r   r6   r   r7   r   r    r   r   <module>rX      s    % % % % % % % % J J J J J J N N N N N N 2 2 2 2 2 2 < < < < < <             - - - - - - : : : : : : : : C C C C C C ? ? ? ? ? ?   	- ,
 ,
 ,
 ,
 ,
^ ,
 ,
 ,
d?f ? ? ? ?. . . . . .4CC"FF#3Xf5E#EFCC C C C C Cr   