
    Pi2                     B   d dl mZ d dlZd dlmZmZmZmZ d dlm	Z	m
Z
mZmZmZmZmZmZmZ d dlmZmZmZmZ ddlmZmZmZmZ ddlmZmZmZ  G d	 d
ej         j!                  Z" G d de"          Z# G d de"          Z$ G d de"          Z% G d de%          Z&dS )    )OptionalN)PerAxisPerGroupPerRowPerToken)	_DTYPE_TO_BIT_WIDTH_DTYPE_TO_QVALUE_BOUNDSMappingType_choose_scale_float8_dequantize_affine_float8_fake_quantize_affine_quantize_affine_float8_Roundchoose_qparams_affine)_get_per_token_block_sizeget_block_sizeget_group_qparams_symmetricget_groupwise_affine_qparams   )FakeQuantizeConfigBaseFloat8FakeQuantizeConfigInt4WeightFakeQuantizeConfigIntxFakeQuantizeConfig) _fake_quantize_per_channel_group_fake_quantize_per_token_log_deprecation_warningc                   J    e Zd ZU dZeed<   defdZededd fd            Z	dS )FakeQuantizerBasez`
    Generic module for applying fake quantization to a tensor, as specified in the config.
    configreturnc                     d| j         z  S )ze
        Return a human readable representation of this `FakeQuantizer` with config details.
        zFakeQuantizer(%s)r   selfs    {/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchao/quantization/qat/fake_quantizer.py__repr__zFakeQuantizerBase.__repr__7   s     #T[00    c                     t          | t                    rt          |           S t          | t                    rt	          |           S t          | t
                    rt          |           S t          d|            )NzUnknown config type: )
isinstancer   IntxFakeQuantizerr   Int4WeightFakeQuantizerr   Float8FakeQuantizer
ValueErrorr"   s    r%   from_configzFakeQuantizerBase.from_config=   s|    f455 	?$V,,, <== 	?*6222 899 	?&v...=V==>>>r'   N)
__name__
__module____qualname____doc__r   __annotations__strr&   staticmethodr.    r'   r%   r   r   0   sy           #"""1# 1 1 1 1 ?2 ?7J ? ? ? \? ? ?r'   r   c                   L     e Zd ZdZdef fdZdej        dej        fdZ xZ	S )r,   zg
    Generic module for applying float8 fake quantization to a tensor, as specified in the config.
    r   c                     t                                                       || _        t          j                            d           d S )Nz,torchao.quantization.qat.Float8FakeQuantizersuper__init__r   torch_C_log_api_usage_oncer$   r   	__class__s     r%   r;   zFloat8FakeQuantizer.__init__N   s=    $$%STTTTTr'   xr    c                    |j         }t          |j        | j        j                  }t          ||| j        j         | j        j        | j        j                  }t          ||| j        j                   }t          |||          }|S )N)hp_value_lbhp_value_ub)
dtyper   shaper   granularityr   rC   rD   r   r   )r$   rA   original_dtype
block_sizescaleqdqs          r%   forwardzFloat8FakeQuantizer.forwardS   s    #AGT[-DEE
$K//
 
 
 $Audk.?@@&q%@@	r'   )
r/   r0   r1   r2   r   r;   r<   TensorrM   __classcell__r@   s   @r%   r,   r,   I   s|         U7 U U U U U U
 %,        r'   r,   c                        e Zd ZdZdef fdZdej        dej        fdZdej        dej        fdZ	dej        dej        fdZ
 xZS )	r+   a   
    Generic module for applying int4 fake quantization to a weight tensor,
    targeting the following FBGEMM kernels:
        torch.ops.fbgemm.f8i4bf16_shuffled
        torch.ops.fbgemm.bf16i4bf16_shuffled
        torch.ops.fbgemm.bf16i4bf16_rowwise
    r   c                     t                                                       || _        t          j                            d           d S )Nz0torchao.quantization.qat.Int4WeightFakeQuantizerr9   r?   s     r%   r;   z Int4WeightFakeQuantizer.__init__k   s=    $$%WXXXXXr'   wr    c                     | j         j        t          j        k    r|                     |          S | j         j        t          j        k    r|                     |          S t          d| j         j                   )NzUnknown activation dtype )r   activation_dtyper<   float8_e4m3fn_fp8_activations_forwardbfloat16_bf16_activations_forwardr-   )r$   rS   s     r%   rM   zInt4WeightFakeQuantizer.forwardp   sk    ;'5+>>>00333[)U^;;11!444W9UWWXXXr'   c           	         |                                 dk    sJ | j        j        t          j        k    sJ t          |j        t                                }t          ||t          j        d          }t          ||t          j                  }t          |||j                  }d}d}|                    |j        d         d| j        j                  }t          j        t          j        |          dd	          }t          j        ||z  |
          }	t          j        |	          }
d| j        j        f}t%          |||	|
t          j        dd          }|                    |j                  S )a:  
        Apply int4 fake quantization to the weight tensor where the input activations
        are expected to be rowwise fp8, using the following as a reference:
        https://github.com/pytorch/FBGEMM/blob/80cc48c4b2b7fcc579e53211fc8715a8592cbd2c/fbgemm_gpu/experimental/gen_ai/gen_ai/quantize.py#L136
           g-q=)rC   ư>   r   Fdimkeepdimminr   i   )quant_dtype	quant_min	quant_max)r`   r   rU   r<   rV   r   rF   r   r   r   r   rE   view
group_sizeamaxabsclamp
zeros_liker   int8to)r$   rS   per_row_block_size	fp8_scalew_fp8epsfbgemm_scale_quant_maxw_fp8_groupedmax_absrJ   
zero_pointper_group_block_sizefqs                r%   rW   z0Int4WeightFakeQuantizer._fp8_activations_forwardx   s\    uuww!||||{+u/BBBBB ,AGVXX>>(	
 
 
	 (9e6IJJ)%AGDD !"

5;q>2t{7MNN*UY}552uMMMG&<<#FFF%e,,
 !4;#9:" 

 
 
 uuQW~~r'   c                    |                                 dk    sJ | j        j        t          j        k    sJ d}d\  }}d}|                    t          j                                      |j        d         d| j        j	                  }t          j
        |dd          }t          j        |dd          }t          j        ||z
  |	          |z  }	||	|z  z   }
t          j        ||z
  |	z                                ||          }||z
  }||	z  |
z   }|                    |j                                      |j                  S )
a3  
        Apply int4 fake quantization to the weight tensor where the input activations
        are expected to be bf16, using the following as a reference:
        https://github.com/pytorch/FBGEMM/blob/80cc48c4b2b7fcc579e53211fc8715a8592cbd2c/fbgemm_gpu/experimental/gen_ai/gen_ai/quantize.py#L152
        r[   r\   )r      r]   r   r^   Tr_   rb   )r`   r   rU   r<   rX   ro   float32rh   rF   ri   rj   aminrl   r   applyrE   )r$   rS   rs   qminqmaxfbgemm_symmetric_qmax	w_groupedmax_valmin_valrJ   rw   ry   s               r%   rY   z1Int4WeightFakeQuantizer._bf16_activations_forward   s5    uuww!||||{+u~====
d !DD'',,QWQZT[=STT	*YB===*YB===Gg-3777$>u'<<<
\9w.%788>>tTJJ''%Z*$wwqw""17+++r'   )r/   r0   r1   r2   r   r;   r<   rN   rM   rW   rY   rO   rP   s   @r%   r+   r+   b   s         Y; Y Y Y Y Y Y
Y Y%, Y Y Y Y'%, '5< ' ' ' 'R,5< ,EL , , , , , , , ,r'   r+   c                        e Zd ZdZdef fdZdej        dej        fdZdej        dej        fdZ	dej        dej        fdZ
defd	ZddZ xZS )r*   zh
    Generic module for applying integer fake quantization to a tensor, as specified in the config.
    r   c                     t                                                       t          j                            d           || _        d| _        d | _        d | _        d| _	        d| _
        d S )Nz*torchao.quantization.qat.IntxFakeQuantizerTg&.>F)r:   r;   r<   r=   r>   r   enabledrJ   rw   
_scale_eps_initializedr?   s     r%   r;   zIntxFakeQuantizer.__init__   sc    $$%QRRR-1
26 !r'   rA   r    c                    | j         s|S | j        j        r$| j        s| j        | j        t          d          t          | j        j        t                    r| 
                    |          S t          | j        j        t          t          f          r|                     |          S t          d| j        j        z            )z
        Apply fake quantization to the tensor based on the bit-width,
        granularity, symmetry, and other properties specified in the config.
        NzScales and zero points must be initialized for range learning. Please call `torchao.quantization.qat.initialize_fake_quantizers` before initializing the optimizer and beginning training.zUnknown granularity '%s')r   r   range_learningr   rJ   rw   r-   r)   rG   r   _per_token_forwardr   r   _per_channel_or_group_forward)r$   rA   s     r%   rM   zIntxFakeQuantizer.forward   s    
 | 	H K&		%		 #t'>L   dk-x88 	S**1---/'81DEE 	S55a8887$+:QQRRRr'   c                    | j         j        rt          d          t          | j         j                 \  }}|                                 rxt          |t          j        t          |          | j         j        ||| j         j
        | j         j        | j         j        	  	        \  | _        | _        |                                  t!          || j        | j        ||          S )zD
        Perform per token fake quantization on the tensor.
        z(Symmetric per token is not supported yet)mapping_typerI   target_dtyperf   rg   rs   scale_dtypezero_point_dtype)r   is_symmetricNotImplementedErrorr	   rE   _should_compute_qparamsr   r
   
ASYMMETRICr   rs   scale_precisionzero_point_precisionrJ   rw   (_maybe_update_qparams_for_range_learningr   )r$   rA   r   r   s       r%   r   z$IntxFakeQuantizer._per_token_forward   s     ;# 	R%&PQQQ,T[->?
d'')) 	<*?(34Q77![.KO K7!%!A
+ 
+ 
+'DJ 99;;;'4:tdSSSr'   c           	      2   | j         j        }| j         j        }| j         j        }| j         j        }| j         j        }t          |t                    r(|j        dk    sJ |	                                d         }n/t          |t                    r|j        }nt          d|z            |                                 rt          | j         j                 }|r,t!          ||||| j         j                  \  | _        | _        n+t)          ||||| j         j                  \  | _        | _        | j                            |          | _        |                                  t.          | j         j                 \  }	}
t1          || j        | j        |	|
||          S )z
        Perform per channel or per group fake quantization on the tensor.
        We express per channel using per group where the group size is the size
        of the last dimension of the tensor.
        r   r^   zUnexpected granularity '%s')rs   )r   rG   r   r   zero_point_domainr   r)   r   axissizer   ri   r-   r   r   rE   r   rs   rJ   rw   r   ro   r   r	   r   )r$   rA   rG   r   r   r   r   ri   	bit_widthr   r   s              r%   r   z/IntxFakeQuantizer._per_channel_or_group_forward   s    k-+5#{? K9{/ k7++ 	J#q(((("JJX.. 	J$/JJ:[HIII '')) 	<+DK,=>I 0K#1 1 1-T__ 1M#1 1 1-T_ #o001EFFDO99;;;,T[->?
d/JO
 
 	
r'   c                 >    | j         j        p| j        du p| j        du S )zO
        Return whether we need to compute new scales and zero points.
        N)r   
is_dynamicrJ   rw   r#   s    r%   r   z)IntxFakeQuantizer._should_compute_qparams1  s(     {%Vt);VtRV?VVr'   Nc                 r   | j         j        rHt          | j        t          j        j                  s$t          | j        t          j        j                  rdS | j        | j        }}t          | j         j	                 \  }}t	          j
        || j                  }t          j                            |d          | _        | j         j        r| j                                         dS t          j        |          }t	          j
        |||          }t          j                            |d          | _        dS )z
        If range learning is enabled, turn scales and zero points into trainable parameters.
        This function is idempotent and should only be called once.
        Nrb   T)requires_grad)r   r   r)   rJ   r<   nn	Parameterrw   r	   rE   rl   r   r   zero_r   r~   )r$   rJ   rw   r   r   s        r%   r   z:IntxFakeQuantizer._maybe_update_qparams_for_range_learning7  s    *	$*eh&899	 $/58+=>>	
 F Jz,T[->?
dEt777X''T'BB
;# 	QO!!#####j11JZt<<J#h0040PPDOOOr'   )r    N)r/   r0   r1   r2   r   r;   r<   rN   rM   r   r   boolr   r   rO   rP   s   @r%   r*   r*      s
        "5 " " " " " "S S%, S S S S4TEL TU\ T T T T,5
u| 5
 5
 5
 5
 5
nW W W W WQ Q Q Q Q Q Q Qr'   r*   c                   (     e Zd ZdZdef fdZ xZS )FakeQuantizerz_
    (Deprecated) Please use :class:`~torchao.quantization.qat.IntxFakeQuantizer` instead.
    r   c                 h    t                                          |           t          |            d S )N)r:   r;   r   r?   s     r%   r;   zFakeQuantizer.__init__U  s/        &&&&&r'   )r/   r0   r1   r2   r   r;   rO   rP   s   @r%   r   r   P  sO         '5 ' ' ' ' ' ' ' ' ' 'r'   r   )'typingr   r<    torchao.quantization.granularityr   r   r   r   %torchao.quantization.quant_primitivesr   r	   r
   r   r   r   r   r   r   torchao.quantization.utilsr   r   r   r   fake_quantize_configr   r   r   r   utilsr   r   r   r   Moduler   r,   r+   r*   r   r6   r'   r%   <module>r      st                    
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
                               ? ? ? ? ? ? ? ?2    +   2S, S, S, S, S,/ S, S, S,lTQ TQ TQ TQ TQ) TQ TQ TQp' ' ' ' '% ' ' ' ' 'r'   