
    Pi=              	          d dl mZmZ d dlmZ d dlmZmZ 	 d dlm	Z	 n# e
$ r	 d dlmZ	 Y nw xY wd dlmZmZmZ 	 d dlmZmZ d dlmZmZmZmZ n# e
$ r d d	lmZmZmZmZmZmZ Y nw xY wg d
ZdZ	 d dlmZ n# e
$ r dZY nw xY wi Zi Zi Z G d d          Zdee<   dee<   eed<   eed<    G d d          Z dee <   dee<   eed<   eed<    G d de          Z!eZ"eZ#dee!<   e"ed<   e#ed<    G d de          Z$eZ%eZ&dee$<   e%ed<   e&ed<   dee         dee'         fdZ(de'defd Z)de'defd!Z*	 	 d(d#ej+        d$ed%         d&ed%         dd"fd'Z,d"S ))    )CallableOptional)nn)
LoRALinearQATLoRALinear)TensorCoreTiledLayout)TensorCoreTiledLayoutType)int4_weight_only#int8_dynamic_activation_int4_weight	quantize_)Int4WeightOnlyQATQuantizer Int8DynActInt4WeightQATQuantizer)disable_4w_fake_quantdisable_8da4w_fake_quantenable_4w_fake_quantenable_8da4w_fake_quant)r   r   r   r   r   r   )get_quantizer_modeInt4WeightOnlyQuantizerr   $Int4WeightOnlyQATQuantizerModuleSwapInt8DynActInt4WeightQuantizerr   *Int8DynActInt4WeightQATQuantizerModuleSwapT)qatFc                   &    e Zd ZdZddefdZd ZdS )r   z
    Quantizer for applying int8 per token dynamic activation + int4
    per group weight quantization to linear layers in the model.
       	groupsizec                     || _         d S N)r   )selfr   s     s/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/training/quantization.py__init__z&Int8DynActInt4WeightQuantizer.__init__S   s    "    c                 N    t          | j                  }t          ||           |S r   )r   r   r   )r   modelquantize_fns      r   quantizez&Int8DynActInt4WeightQuantizer.quantizeV   s&    9$.II%%%%r!   N)r   __name__
__module____qualname____doc__intr    r%    r!   r   r   r   M   sM         
# ## # # # #    r!   r   8da4wz	8da4w-qatc                   *    e Zd ZdZd	dedefdZd ZdS )
r   z
    Quantizer for applying int4 per group weight only quantization
    to linear layers in the model using the efficient tinygemm kernel.
          r   inner_k_tilesc                 "    || _         || _        d S r   )r   r1   )r   r   r1   s      r   r    z Int4WeightOnlyQuantizer.__init__m   s    "*r!   c                 x    t          | j                  }t          | j        |          }t	          ||           |S r   )r   r1   r
   r   r   )r   r#   layout_typer$   s       r   r%   z Int4WeightOnlyQuantizer.quantizeq   s8    +D,>??&t~{CC%%%%r!   N)r/   r0   r&   r,   r!   r   r   r   g   sT         
+ +# +C + + + +    r!   r   4wz4w-qatc                       e Zd ZdS )r   Nr'   r(   r)   r,   r!   r   r   r              Dr!   r   z4w-qat-module-swapc                       e Zd ZdS )r   Nr7   r,   r!   r   r   r      r8   r!   r   z8da4w-qat-module-swap	quantizerreturnc                 R    t                               t          |           d          S )a  Given a quantizer object, returns a string that specifies the type of quantization.

    For example, in the case of int4 weight only quantization, we'll return "4w".
    If the quantizer is not recognized as a known quantizer, we'll return None.

    Currently supported:

    - :class:`~torchtune.training.quantization.Int8DynActInt4WeightQuantizer`: "8da4w"
    - :class:`~torchtune.training.quantization.Int4WeightOnlyQuantizer`: "4w"
    - :class:`~torchao.quantization.qat.Int8DynActInt4WeightQATQuantizer`: "8da4w-qat"
    - :class:`~torchao.quantization.qat.Int4WeightOnlyQATQuantizer`: "4w-qat"

    Args:
        quantizer (Optional[Callable]): A callable object that implements the `quantize` method.

    Returns:
        Optional[str]: The quantization mode.
    N)_quantizer_to_modegettype)r:   s    r   r   r      s     & !!$y//4888r!   quantizer_modec                 8    t                               | d          S )zGiven a quantizer mode, return the corresponding function for disabling fake
    quantize in a model prepared by the quantizer.
    If the quantizer is not recognized as a known QAT quantizer, return None.
    N)%_quantizer_mode_to_disable_fake_quantr>   r@   s    r   _get_disable_fake_quantrD      s    
 144^TJJJr!   c                 8    t                               | d          S )zGiven a quantizer mode, return the corresponding function for enabling fake
    quantize in a model prepared by the quantizer.
    If the quantizer is not recognized as a known QAT quantizer, return None.
    N)$_quantizer_mode_to_enable_fake_quantr>   rC   s    r   _get_enable_fake_quantrG      s    
 033NDIIIr!   Nmoduleactivation_qat_configFakeQuantizeConfigweight_qat_configc                     |                                  D ]S\  }}t          |t                    r(t          j        |||          }t          | ||           Bt          |||           TdS )a`  
    Swap all `LoRALinear` in the model with `QATLoRALinear`.

    This is used for combining QAT + LoRA during finetuning. The resulting linear layers
    will apply the following transformation instead:

        x -> fake_quantize(W_frozen) @ fake_quantize(x) + BAx

    Fake quantization here refers to simulating the quantization numerics without actual
    dtype casting, with the goal of providing improved accuracies when the model is
    ultimately quantized after finetuning.

    Args:
        module (nn.Module): The model to swap linear layers on
        activation_qat_config (Optional[FakeQuantizeConfig]): The config for specifying
            how to fake quantize input activations in the base linear layer
        weight_qat_config (Optional[FakeQuantizeConfig]): The config for specifying
            how to fake quantize base linear weights
    N)named_children
isinstancer   r   from_lora_linearsetattrswap_lora_linear_with_qat)rH   rI   rK   namechild
new_linears         r   rQ   rQ      s    4 ,,..  eeZ(( 	&7%! J
 FD*----%%!    r!   )NN)-typingr   r   torchr   torchtune.modules.peft.lorar   r   torchao.dtypesr   ImportErrorr	   torchao.quantizationr
   r   r   torchao.quantization.qatr   r   torchao.quantization.qat.linearr   r   r   r   "torchao.quantization.prototype.qat__all___torchao_0_7_supportedr   r=   rB   rF   r   r   r   !disable_4w_fake_quant_module_swap enable_4w_fake_quant_module_swapr   $disable_8da4w_fake_quant_module_swap#enable_8da4w_fake_quant_module_swapstrr   rD   rG   ModulerQ   r,   r!   r   <module>rf      s~   & % % % % % % %       A A A A A A A AR4444444 R R RQQQQQQQQR                              	 	 	                 	    #((((((( # # #"#  (* %') $        5< 0 17B 3 45M %k 24K $[ 1       " /3 * +19 - .2G %h /1E $X .	 	 	 	 	+E 	 	 	 %: !#7  ;O 7 8 & &
 % %	 	 	 	 	1Q 	 	 	 (@ $&= #AX = > ) &
 ( %
9(8"4 9# 9 9 9 9,KC KH K K K KJ3 J8 J J J J =A8<' 'I' $$89	'
   45' 
' ' ' ' ' 's/    --A A('A(2A9 9BB