
    PiX                        d dl mZmZ d dlZd dlmc mZ d dlm	Z	 d dl
mZmZ d dlmZmZmZmZmZmZ d dlmZmZ d dlmZ d dlmZ d	d
lmZmZmZ d	dlm Z  d	dl!m"Z"  G d dej        j#                  Z$	 d*dej        j%        de&fdZ'dej        j%        fdZ( G d de          Z) G d de)          Z* G d de$          Z+dej        j%        fdZ,dej        j%        fdZ-dej.        defdZ/de0dej.        defd Z1 G d! d"e)          Z2 G d# d$e$          Z3dej        j%        fd%Z4dej        j%        fd&Z5de0dej.        defd'Z6 G d( d)e)          Z7dS )+    )AnyOptionalN)	is_device)PerGroupPerRow)Int8DynActInt4WeightLinearWeightOnlyInt4Linear_check_linear_int4_k_replace_linear_8da4w_replace_linear_int4 groupwise_affine_quantize_tensor)TorchAODTypeZeroPointDomain)TwoStepQuantizer)get_group_qparams_symmetric   )FakeQuantizeConfigBaseFloat8FakeQuantizeConfigIntxFakeQuantizeConfig)FakeQuantizerBase)_get_qmin_qmaxc                        e Zd ZdZ	 	 	 ddedededee         dee         d	df fd
Zde	j
        d	e	j
        fdZd	e	j        j        fdZe	 	 dde	j        j        dee         dee         fd            Z xZS )FakeQuantizedLineara  
    General linear layer with fake quantized weights and activations.

    Specific target dtypes, granularity, schemes etc. are specified
    through separate configs for weights and activations.

    Example usage::

        activation_config = IntxFakeQuantizeConfig(
            dtype=torch.int8,
            granularity="per_token",
            is_symmetric=False,
        )
        weight_config = IntxFakeQuantizeConfig(
            dtype=torch.int4,
            group_size=8,
            is_symmetric=True,
        )
        fq_linear = FakeQuantizedLinear(
            16, 32, False, activation_config, weight_config,
        )
        fq_linear(torch.randn(16))
    FNin_featuresout_featuresbiasactivation_configweight_configreturnc                     t                      j        |||g|R i | t          j                            d           |t          j        |          | _        nd | _        |rt          |t                    rBt          |j
        t                    r(|j        }|||z  dk    rt          d|d|d          t          j        |          | _        d S d | _        d S )Nz,torchao.quantization.qat.FakeQuantizedLinearr   zin_features (z) % group_size (z) must be == 0)super__init__torch_C_log_api_usage_oncer   from_configactivation_fake_quantizer
isinstancer   granularityr   
group_size
ValueErrorweight_fake_quantizer)
selfr   r   r   r   r   argskwargsr*   	__class__s
            s/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchao/quantization/qat/linear.pyr"   zFakeQuantizedLinear.__init__C   s/    		
 		
 	
 	

 	
 	
 	
 	$$%STTT(->-J!. .D** .2D* $-)?@@ Z)8F F  +5
)kJ.F!.K.K$*&;;


4   *;)F})U)UD&&&)-D&&&    xc                     | j         |                      |          }| j        |                     | j                  }n| j        }t          j        ||| j                  S N)r'   r,   weightFlinearr   )r-   r3   ws      r1   forwardzFakeQuantizedLinear.forwardl   s[    )5..q11A%1**4;77AAAx1di(((r2   c                    t           j                            | j        | j        | j        d u| j        j        | j        j                  }| j        j        t          j        d          k    r| j        |_        | j        |_        |S )Ndevicedtypemeta)	r#   nnLinearr   r   r   r6   r=   r>   )r-   
new_linears     r1   	to_linearzFakeQuantizedLinear.to_linearu   sw    X__IT!;%+# % 
 

 ;f!5!555 $J"iJOr2   modc           	          t          |j        |j        |j        d u|||j        j        |j        j                  }|j        j        t          j        d          k    r|j        |_        |j        |_        |S )Nr   r   r=   r>   r?   )r   r   r   r   r6   r=   r>   r#   )clsrD   r   r   rB   s        r1   from_linearzFakeQuantizedLinear.from_linear   sx     )OHD /':$*"
 
 

 :V 4 444 #
J!hJOr2   )FNN)NN)__name__
__module____qualname____doc__intboolr   r   r"   r#   Tensorr:   r@   rA   rC   classmethodrH   __classcell__r0   s   @r1   r   r   *   s=        8 >B:>'. '.'. '. 	'.
 $$:;'.   67'. 
'. '. '. '. '. '.R) )%, ) ) ) )58?       ?C:>	 X_ $$:;   67	   [    r2   r   TrD   enabledc                     t          | t                    r(| j        || j        _        | j        || j        _        dS dS dS )zO
    Helper function to enable fake quantization in `FakeQuantizedLinear`.
    N)r(   r   r'   rS   r,   )rD   rS   s     r1   enable_linear_fake_quantrU      sW     #*++ 8(44;C)1$007C%---	8 8 10r2   c                 (    t          | d           dS )zP
    Helper function to disable fake quantization in `FakeQuantizedLinear`.
    F)rS   N)rU   rD   s    r1   disable_linear_fake_quantrX      s     S%000000r2   c                   B    e Zd ZdZdee         fdZdee         fdZdS )_LegacyQATQuantizerzM
    Base class for sharing common methods across legacy QAT quantizers.
    r   c                     d S r5    r-   s    r1   #get_activation_fake_quantize_configz7_LegacyQATQuantizer.get_activation_fake_quantize_config       tr2   c                     d S r5   r\   r]   s    r1   get_weight_fake_quantize_configz3_LegacyQATQuantizer.get_weight_fake_quantize_config   r_   r2   N)rI   rJ   rK   rL   r   r   r^   ra   r\   r2   r1   rZ   rZ      s`         X>T5U    :P1Q      r2   rZ   c                   :    e Zd ZdZddej        ej        fdededej        dej        dd	f
 fd
Z	dej
        j        dededej
        j        fdZdej
        j        dededej
        j        fdZdej
        j        fdZdee         fdZdee         fdZ xZS ) Int8DynActInt4WeightQATQuantizerz
    Quantizer for performing QAT on a model, where linear layers have int8
    dynamic per token fake quantized activations and int4 fake quantized
    grouped per channel weights.
       F	groupsizepadding_allowed	precisionscales_precisionr   Nc                     t                                                       t          j                            d           || _        || _        || _        || _        t          j	        | _
        d S )Nz9torchao.quantization.qat.Int8DynActInt4WeightQATQuantizer)r!   r"   r#   r$   r%   re   rf   rg   rh   float32activation_scales_precision)r-   re   rf   rg   rh   r0   s        r1   r"   z)Int8DynActInt4WeightQATQuantizer.__init__   sh     	$$G	
 	
 	
 (%4&/-=+0=(((r2   modelr.   r/   c           	      d    t          || j        | j        | j        | j        t
          d           |S )NT)copy_weights)r   re   rf   rg   rh   Int8DynActInt4WeightQATLinearr-   rl   r.   r/   s       r1   preparez(Int8DynActInt4WeightQATQuantizer.prepare   sA     	N N!)	
 	
 	
 	
 r2   c                 0    |                      |           |S r5   )_convert_qat_linear_8da4wrp   s       r1   convertz(Int8DynActInt4WeightQATQuantizer.convert   s     	&&u---r2   modulec           
         |                                 D ]/\  }}t          |t                    r|j        j        }t          |j        |j        |j        du|j	        |j
        j        |j                  }t          |||           d}t          |          \  }}t          |j
        ||j	        |j                  \  }	}
|
                    |j                  }
ddlm}  ||j
        |	|
||t(          j        |j	                  }||_
        |	|_        |
|_        |j        |j        |_        |                     |           1dS )z`
        Replace all `Int8DynActInt4WeightQATLinear` with `Int8DynActInt4WeightLinear`.
        N)re   rg   rh      )rg   r   )8_quantized_decomposed_quantize_per_channel_group_wrapper)named_childrenr(   ro   r,   configr   r   r   r   r*   r6   r>   scale_precisionsetattrr   r   tozero_point_precisiontorchao._executorch_opsrx   r#   int8scaleszerosrs   )r-   ru   namechildrz   quantized_linearn_bitqminqmaxszprx   q_weights                r1   rs   z:Int8DynActInt4WeightQATQuantizer._convert_qat_linear_8da4w   s    "0022 *	6 *	6KD%%!>?? )64;#=%&Jd*$/#l0%+%;$ $ $  &6777 -e44t5L%$4	  B UU6677      TSLJ%  +3 '*+ ')+ &:),1J$)..u5555U*	6 *	6r2   c                 *    t          | j                  S r5   )_get_8da4w_activation_configrk   r]   s    r1   r^   zDInt8DynActInt4WeightQATQuantizer.get_activation_fake_quantize_config&  s    +D,LMMMr2   c                 6    t          | j        | j                  S r5   )_get_8da4w_weight_configre   rh   r]   s    r1   ra   z@Int8DynActInt4WeightQATQuantizer.get_weight_fake_quantize_config)  s    '8MNNNr2   )rI   rJ   rK   rL   r#   rj   rM   rN   r>   r"   r@   Moduler   rq   rt   rs   r   r   r^   ra   rQ   rR   s   @r1   rc   rc      sz          %!&(-9 99 9 ;	9
  +9 
9 9 9 9 9 9$X_-0<?	   X_-0<?	   .6 .6 .6 .6 .6`NX>T5U N N N NO:P1Q O O O O O O O Or2   rc   c                        e Zd ZdZdddej        ej        fdedededej        d	ed
ej	        dej	        ddf fdZ
ddefdZd Z xZS )ro   a  
    This module implements a linear layer with int8 dynamic per token fake
    quantized activations with int4 fake quantized grouped per channel weights.

    args:
        groupsize: the number of elements in each quantized group for weights
        precision: precision of weights
        scales_precision: precision of per group scales and zero points

    Note: we hardcode activation scales to use torch.fp32, but allow users to specify the weight scales (defaults to torch.fp32).
    To get an exact numerical match with Int8DynamicActivationInt4WeightConfig, users must use the same dtype for both the weights
    and the scales. Here scales_precision refers specifically to the weight scales only, not the activation scales.
    FNrd   r   r   r   r=   re   rg   rh   r   c           	          t          t          j                  }t          ||          }	t	                                          |||||	||           d S )Nr<   )r   r#   rj   r   r!   r"   )r-   r   r   r   r=   re   rg   rh   r   r   r0   s             r1   r"   z&Int8DynActInt4WeightQATLinear.__init__<  se     9GG0<LMM 	 	
 	
 	
 	
 	
r2   TrS   c                 6    || j         _        || j        _        d S r5   r'   rS   r,   r-   rS   s     r1   enable_fake_quantz/Int8DynActInt4WeightQATLinear.enable_fake_quantU      18&.-4"***r2   c                 0    |                      d           d S NFr   r]   s    r1   disable_fake_quantz0Int8DynActInt4WeightQATLinear.disable_fake_quantY      u%%%%%r2   T)rI   rJ   rK   rL   r#   rj   rM   rN   r=   r>   r"   r   r   rQ   rR   s   @r1   ro   ro   -  s         $ #!&(-
 

 
 	

 
 
 ;
  +
 

 
 
 
 
 
25 5 5 5 5 5& & & & & & &r2   ro   c                 \    t          | t                    r|                                  dS dS )zT
    (deprecated) Enable fake quantization for `Int8DynActInt4WeightQATLinear`.
    N)r(   ro   r   rW   s    r1   enable_8da4w_fake_quantr   ^  s9     #455     r2   c                 \    t          | t                    r|                                  dS dS )zU
    (deprecated) Disable fake quantization for `Int8DynActInt4WeightQATLinear`.
    N)r(   ro   r   rW   s    r1   disable_8da4w_fake_quantr   g  s9     #455 !     ! !r2   qparams_precisionr   c                     | t           j        k    sJ t          t           j        ddd| | t          j        |           j                  S )z`
    Return the activation `IntxFakeQuantizeConfig` for `Int8DynActInt4WeightQATQuantizer`.
    	per_tokenFT)r>   r)   is_symmetric
is_dynamicr{   r~   eps)r#   rj   r   r   finfor   )r   s    r1   r   r   o  sS     ----!j).K)**.   r2   r*   c                 @    t          t          j        | dd||          S )z\
    Return the weight `IntxFakeQuantizeConfig` for `Int8DynActInt4WeightQATQuantizer`.
    T)r>   r*   r   r   r{   r~   )r   r   INT4r*   r   s     r1   r   r     s0     ").   r2   c                   .    e Zd ZdZddej        ej        fdedee         dej        dej        dd	f
 fd
Z	dej
        j        dededej
        j        fdZdej
        j        dededej
        j        fdZdej
        j        fdZdee         fdZ xZS )Int4WeightOnlyQATQuantizerz
    Quantizer for performing QAT on a model, where linear layers have
    int4 fake quantized grouped per channel weights.
    rd      re   inner_k_tilesrg   rh   r   Nc                     t                                                       t          j                            d           |dv sJ |dv sJ || _        || _        || _        || _        d S )Nz3torchao.quantization.qat.Int4WeightOnlyQATQuantizer)   rw   r   )    @      rd   )	r!   r"   r#   r$   r%   r   re   rg   rh   )r-   re   r   rg   rh   r0   s        r1   r"   z#Int4WeightOnlyQATQuantizer.__init__  s     	$$A	
 	
 	
 	)))).....*"" 0r2   rl   r.   r/   c           
      f    t          || j        | j        d| j        | j        t
          d           |S )NT)rf   rg   rh   linear_classrn   )r   re   r   rg   rh   Int4WeightOnlyQATLinearrp   s       r1   rq   z"Int4WeightOnlyQATQuantizer.prepare  sD     	N n!20		
 		
 		
 		
 r2   c                 0    |                      |           |S r5   )_convert_qat_linear_4wrp   s       r1   rt   z"Int4WeightOnlyQATQuantizer.convert  s     	##E***r2   ru   c                 H   |                                 D ]\  }}t          |t                    rZ|j        }|j        }|j        }|j        j        }t          ||d|j	        ||j
        j        |j        t          |                                          j                  }t!          |||           d}	t#          |j
        |	|j	                  \  }
}t%          |
j        j        d          rHt(          j        j                            |
                    |j
        j                  |j                  }
nGt(          j        j                            |
                    |j
        j                  |j                  }
|
|_
        ||_        v|                     |           dS )zT
        Replace all `Int4WeightOnlyQATLinear` with `WeightOnlyInt4Linear`.
        F)r   re   r   rg   rh   r=   rw   cpuN)ry   r(   r   r   r   r   r,   rz   r	   r*   r6   r>   r{   next
parametersr=   r|   r   r   typer#   opsaten#_convert_weight_to_int4pack_for_cpur}   _convert_weight_to_int4packscales_and_zerosr   )r-   ru   r   r   r   r   r   rz   r   r   r   r   s               r1   r   z1Int4WeightOnlyQATQuantizer._convert_qat_linear_4w  s    "0022 &	3 &	3KD%%!899 %3#/$1 % 34;#7 $/"/#l0%+%; 0 0 2 233:	$ 	$ 	$  &6777 /OL%0 0,+
 X_1599 	$y~QQ EL$788+   HH
  %y~II EL$788+   H +3 '4D 11++E2222M&	3 &	3r2   c                 6    t          | j        | j                  S r5   )_get_4w_weight_configre   rh   r]   s    r1   ra   z:Int4WeightOnlyQATQuantizer.get_weight_fake_quantize_config  s    $T^T5JKKKr2   )rI   rJ   rK   rL   r#   bfloat16rM   r   r>   r"   r@   r   r   rq   rt   r   r   ra   rQ   rR   s   @r1   r   r     sX         '(!&(-1 11  }1 ;	1
  +1 
1 1 1 1 1 1$X_-0<?	   X_-0<?	   *3UX_ *3 *3 *3 *3XL:P1Q L L L L L L L Lr2   r   c                        e Zd ZdZddddej        ej        fdededed	ej        d
ededej	        dej	        ddf fdZ
ddefdZd Z xZS )r   a  
    This module implements a linear layer with int4 fake quantized grouped
    per channel weights, with forward numerics matching `WeightOnlyInt4Linear`,
    which uses the efficient int4 tinygemm kernel.

    args:
        groupsize: the number of elements in each quantized group for weights
        precision: precision of weights
        scales_precision: precision of per group scales and zero points
    FNrd   r   r   r   r   r=   re   r   rg   rh   r   c	           	          |t           j        k    s
J d            t          |||          st          d          || _        t          ||          }	t                                          |||d |	||           d S )Nz!only bf16 is supported for scalesz'Padding for QAT 4w is not supported yetrF   )r#   r   r
   r+   r   r   r!   r"   )r-   r   r   r   r=   re   r   rg   rh   r   r0   s             r1   r"   z Int4WeightOnlyQATLinear.__init__  s      5>1113V111#KMJJ 	HFGGG*-i9IJJ"' 	 	
 	
 	
 	
 	
r2   TrS   c                 6    || j         _        || j        _        d S r5   r   r   s     r1   r   z)Int4WeightOnlyQATLinear.enable_fake_quant  r   r2   c                 0    |                      d           d S r   r   r]   s    r1   r   z*Int4WeightOnlyQATLinear.disable_fake_quant  r   r2   r   )rI   rJ   rK   rL   r#   r   rM   rN   r=   r>   r"   r   r   rQ   rR   s   @r1   r   r     s        	 	 #!&(-
 

 
 	

 
 
 
 ;
  +
 

 
 
 
 
 
45 5 5 5 5 5& & & & & & &r2   r   c                 \    t          | t                    r|                                  dS dS )zN
    (deprecated) Enable fake quantization for `Int4WeightOnlyQATLinear`.
    N)r(   r   r   rW   s    r1   enable_4w_fake_quantr   $  s9     #.//     r2   c                 \    t          | t                    r|                                  dS dS )zO
    (deprecated) Disable fake quantization for `Int4WeightOnlyQATLinear`.
    N)r(   r   r   rW   s    r1   disable_4w_fake_quantr   -  s9     #.// !     ! !r2   c           	      V    t          t          j        | dd||t          j                  S )zV
    Return the weight `IntxFakeQuantizeConfig` for `Int4WeightOnlyQATQuantizer`.
    FT)r>   r*   r   r   r{   r~   zero_point_domain)r   r#   uint4r   FLOATr   s     r1   r   r   5  s5     "k).)/   r2   c                       e Zd ZdZdej        fdee         dej        fdZ	dej
        j        deded	ej
        j        fd
Zdej
        j        deded	ej
        j        fdZd	ee         fdZd	ee         fdZdS )Float8ActInt4WeightQATQuantizera  
    QAT quantizer for applying dynamic rowwise float8 activation + int4
    per group/channel symmetric weight fake quantization to linear layers
    in the model. Currently only supports rowwise granularity for float8
    activations.

    args:
        group_size (Optional[int]): the number of elements in each quantized
            group for weights, defaults to 64. Use None for per channel.
        scale_precision: precision of weight scales, defaults to torch.bfloat16.
    r   r*   r{   c                     t           j                            d           |d}nd}t          t           j        t                                | _        t          t           j        ||dd|          | _	        d S )Nz8torchao.quantization.qat.Float8ActInt4WeightQATQuantizer	per_groupper_channel)r>   r)   T)r>   r)   r*   r   r   r{   )
r#   r$   r%   r   float8_e4m3fnr   _activation_configr   int4_weight_config)r-   r*   r{   weight_granularitys       r1   r"   z(Float8ActInt4WeightQATQuantizer.__init__Y  s    
 	$$F	
 	
 	
 !!,!.":%#
 #
 #
 5**!+
 
 
r2   rl   r.   r/   r   c                    |                                 D ]r\  }}t          |t          j        j                  r9t
                              || j        | j                  }t          |||           ]| 
                    |           s|S )z
        Swap all `nn.Linear` with `FakeQuantizedLinear` with float8
        fake quantizer for activations and int4 fake quantizer for weights.
        )r   r   )ry   r(   r#   r@   rA   r   rH   r   r   r|   rq   )r-   rl   r.   r/   r   r   rB   s          r1   rq   z'Float8ActInt4WeightQATQuantizer.preparer  s     !//11 		$ 		$KD%%11 $0<<&*&="&"5 =  

 tZ0000U####r2   c                     t           r5   NotImplementedErrorrp   s       r1   rt   z'Float8ActInt4WeightQATQuantizer.convert  s
     "!r2   c                      t          d          )Nz,Float8 FakeQuantizeConfig does not exist yetr   r]   s    r1   r^   zCFloat8ActInt4WeightQATQuantizer.get_activation_fake_quantize_config  s    !"PQQQr2   c                     | j         S r5   )r   r]   s    r1   ra   z?Float8ActInt4WeightQATQuantizer.get_weight_fake_quantize_config  s    !!r2   N)rI   rJ   rK   rL   r#   r   r   rM   r>   r"   r@   r   r   rq   rt   r   r^   ra   r\   r2   r1   r   r   L  s       
 
 %'',~
 
SM
 
 
 
 
2X_-0<?	   ("X_"-0"<?"	" " " "
RX>T5U R R R R":P1Q " " " " " "r2   r   r   )8typingr   r   r#   torch.nn.functionalr@   
functionalr7   torchao.dtypes.utilsr    torchao.quantization.granularityr   r   )torchao.quantization.linear_quant_modulesr   r	   r
   r   r   r   %torchao.quantization.quant_primitivesr   r   torchao.quantization.unifiedr   torchao.quantization.utilsr   fake_quantize_configr   r   r   fake_quantizerr   utilsr   rA   r   r   rN   rU   rX   rZ   rc   ro   r   r   r>   r   rM   r   r   r   r   r   r   r   r\   r2   r1   <module>r      s=   !                         * * * * * * = = = = = = = =                       : 9 9 9 9 9 B B B B B B         
          
q q q q q%(/ q q ql 8 8	88 8 8 8158? 1 1 1 1	 	 	 	 	* 	 	 	"aO aO aO aO aO': aO aO aOH-& -& -& -& -&$7 -& -& -&b         !%(/ ! ! ! !{   &{    ,ZL ZL ZL ZL ZL!4 ZL ZL ZLz+& +& +& +& +&1 +& +& +&^ eho        !ux ! ! ! !{    .C" C" C" C" C"&9 C" C" C" C" C"r2   