
    Pi'                        d Z ddlZddlmc mZ ddlmZmZ g dZ	ddZ
 G d d	ej        j                  Z G d
 deej        j                  Zej        j        eej        j        j        j        eiZ	 d	 ddZdddZd ZdS )a(  
Testing out accuracy-only implementation of SmoothQuant
(https://arxiv.org/pdf/2211.10438.pdf)
Note: this is an application of input-weight equalization, with the addition that the
multiplication by scale is fused into the preceding layer, specifically for relevant
parts of transformer blocks.
    N   )$_quant_int8_dynamic_per_token_linear dynamically_quantize_per_channel)	get_scaleSmoothFakeDynQuantMixin$SmoothFakeDynamicallyQuantizedLinear!swap_linear_with_smooth_fq_linearsmooth_fq_linear_to_inferenceset_smooth_fq_attribute      ?c                     t          j        | |          }t          j        |d|z
            }||z  }|                    d          S )a  
    Calculate the scale based on abs(max(X)), abs(max(W)), and alpha.

    Args:
        X_absmax (torch.Tensor): Absolute maximum values of the input tensor X.
        W_absmax (torch.Tensor): Absolute maximum values of the weight tensor W.
        alpha (float, optional): Scaling factor. Defaults to 0.5.

    Returns:
        torch.Tensor: The calculated scale of dimension `k` if X is of dimension `b*n*k` and W is of dimension `k*m`.
    g      ?)torchpowreshape)X_absmaxW_absmaxalphaX_powW_powdivs         t/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchao/quantization/smoothquant.pyr   r   !   sC     Ih&&EIhe,,E
%-C;;r??    c                   2    e Zd Zd Zd Zd Zd Zd Zd ZdS )r   c                 x    d| _         d | _        |                     dd            || _        d| _        d| _        d S )NTsmooth_scaleF)calibratingx_running_abs_maxregister_bufferr   debug_skip_scalingstore_w_int_repr_t)selfr   s     r   init_smoothquant_variablesz2SmoothFakeDynQuantMixin.init_smoothquant_variables4   sG    !%^T222
"' #(r   c                    t          t          t          |j                  dz
                      }t	          j        t	          j        |          |          }| j        	|| _        d S t	          j        || j                  | _        d S )Nr   dim)	tuplerangelenshaper   amaxabsr   max)r"   Xall_dims_except_lastcur_abs_maxs       r   update_x_running_abs_maxz0SmoothFakeDynQuantMixin.update_x_running_abs_maxH   sv    $U3qw<<!+;%<%<==j13GHHH!)%0D"""%*Y{D<R%S%SD"""r   c                 Z   | j         
J d            | j        }| j        sTt          j        t          j        | j                   |                    dd                                        dd          }t          |ddt          j                  \  }}}|	                                }|||fS )Nz5self.smooth_scale is None, did you turn on inference?r   r   i   )
r   weightr    r   matmuldiag	transposer   int8
contiguous)r"   W
W_int_reprW_scalesW_zpss        r   get_scaled_quantized_wz.SmoothFakeDynQuantMixin.get_scaled_quantized_wQ   s     ,,C -,, K
 & 	
4,--q{{1a/@/@ i1oo  'GtS%*'
 '
#
He  **,,
8U**r   c                     t                      NNotImplementedErrorr"   s    r   to_inferencez$SmoothFakeDynQuantMixin.to_inferencej   s    !###r   c                    |                                  \  }| _        }| j        r=|                     d|                    dd                                                     n(|                     d|                                           | `d S )Nr;   r   r   )r>   r<   r!   r   r7   r9   r4   )r"   r;   _W_zpss      r   fold_weightz#SmoothFakeDynQuantMixin.fold_weightm   s     -1,G,G,I,I)
DM6 " 	H  z/C/CAq/I/I/T/T/V/VWWWW  z/D/D/F/FGGGKKKr   c                     t                      )z
        Sets `self.x_running_abs_max` to a value which will lead to smooth scale
        of all ones if `alpha=0.5`, to enable performance benchmarking without
        calibration.
        rA   rC   s    r   set_debug_x_absmaxz*SmoothFakeDynQuantMixin.set_debug_x_absmaxy   s     "###r   N)	__name__
__module____qualname__r#   r1   r>   rD   rG   rI    r   r   r   r   3   sq        ( ( ((T T T+ + +2$ $ $
 
 
$ $ $ $ $r   r   c                   L     e Zd ZdZ fdZd Zedd            Zd Zd Z	 xZ
S )	r   z
    This is a replacement for `torch.nn.Linear` which implements dynamic per-token
    activation quantization and dynamic per-channel weight quantization based on
    Smoothquant scaling.
    c                     |                     d          } t                      j        |i | |                     |           d S )Nr   )popsuper__init__r#   )r"   argskwargsr   	__class__s       r   rR   z-SmoothFakeDynamicallyQuantizedLinear.__init__   sJ    

7##$)&)))''.....r   c                 4   | j         r6|                     |           t          j        || j        | j                  }nZ| j        s
|| j        z  }| j        r| j	        n| j	        
                                }t          ||| j        | j        |j                  }|S r@   )r   r1   Flinearr4   biasr    r   r!   r;   tr   r<   dtype)r"   r.   rS   rT   YW_int_repr_ts         r   forwardz,SmoothFakeDynamicallyQuantizedLinear.forward   s     	))!,,,DK33AA* * ))#'#:S@Q@Q@S@S  5<	17 A r   r   c                    d\  }} | |||j         du|          }|j        |_        |j        |_        |j        |_        |j         |_         t	          |                                          j        }|                    |           |S )z
        Converts a `mod` of class `torch.nn.Linear` to the smooth fake quantized
        version of it.  Note: requires calibration.
        )   r`   N)rY   r   )rY   in_featuresout_featuresr4   next
parametersdeviceto)clsmodr   fake_in_featuresfake_out_featuresnew_moddevice_to_uses          r   
from_floatz/SmoothFakeDynamicallyQuantizedLinear.from_float   s     /3++#/chd6JRW
 
 
 "o"/xS^^--..5

=!!!r   c                 (   | j         
J d            d| _        t          | j         t          j        t          j        | j                            dd                    d          j        | j	                  | _
        |                                  dS )zl
        Calculates the smoothquant scale based on calibration
        in preparation for inference
        Nzno calibration data foundFr   r   r%   r   )r   r   r   r   r-   r,   r4   r7   valuesr   r   rG   rC   s    r   rD   z1SmoothFakeDynamicallyQuantizedLinear.to_inference   s    
 %113N111 %"Iei 5 5a ; ;<<!DDDK*
 
 

 	r   c                     t          j        t          j        | j                            dd                    d          j        }|| _        d S )Nr   r   r%   )r   r-   r,   r4   r7   rp   r   )r"   w_absmaxs     r   rI   z7SmoothFakeDynamicallyQuantizedLinear.set_debug_x_absmax   sC    9UYt{'<'<Q'B'BCCKKKR!)r   r   )rJ   rK   rL   __doc__rR   r^   classmethodrm   rD   rI   __classcell__)rU   s   @r   r   r      s         / / / / /
  $    [&  * * * * * * *r   r    returnc                    t          |                                           }|                                D ]\  }}|dk    r|}n| d| }|||vrkt          |          t                                          v rCt          t          |                   }|                    ||          }	t          | ||	           t          ||||           dS )a&  
    Replaces linear layers in the model with their SmoothFakeDynamicallyQuantizedLinear equivalents.

    Args:
        model (torch.nn.Module): The model containing linear layers to be replaced.
        skip_fqn_list (list of str, optional): List of fully qualified names to skip during replacement. Defaults to None.
        cur_fqn (str, optional): The current fully qualified name of the module being processed. Defaults to "".
        alpha (float, optional): The scaling factor for SmoothQuant. Defaults to 0.5.

    Returns:
        None
    rw   .Nro   )	dictnamed_childrenitemstypesource_cls_to_target_clskeysrm   setattrr	   )
modelskip_fqn_listcur_fqnr   name_to_childnamechildnew_fqn
target_cls	new_childs
             r   r	   r	      s      --//00M$**,, T Teb==GG ))4))G"}(D(DKK388::::1$u++>J"--e5-AAIE4++++-e]GUSSSST Tr   Fc                     |                                  D ]c\  }}t          |t          t                                                              r*|r|                                 |                                 ddS )a  
    Prepares the model for inference by calculating the smoothquant scale for each SmoothFakeDynamicallyQuantizedLinear layer.

    Args:
        model (torch.nn.Module): The model containing SmoothFakeDynamicallyQuantizedLinear layers.
        debug_skip_calibration (bool, optional): If True, sets the running maximum of activations to a debug value for performance benchmarking.
                                                 Defaults to False.

    Returns:
        None
    N)named_modules
isinstancer'   r   rp   rI   rD   )r   debug_skip_calibration_rh   s       r   r
   r
      s     %%''  3c5!9!@!@!B!BCCDD 	% )&&(((	 r   c                     |                                  D ]Z\  }}t          |t          t                                                              r!t          ||          rt          |||           [d S r@   )r   r   r'   r   rp   hasattrr   )r   attribute_namenew_attribute_valr   rh   s        r   r   r     s~    %%'' @ @3c5!9!@!@!B!BCCDD 	@sN++ @^->???@ @r   rs   )Nrw   r   )rx   N)F)rt   r   torch.nn.functionalnn
functionalrW   utilsr   r   __all__r   Moduler   Linearr   modulesrX   NonDynamicallyQuantizableLinearr   r	   r
   r   rM   r   r   <module>r      sv                    
     $L$ L$ L$ L$ L$eho L$ L$ L$^B* B* B* B* B*+BEHO B* B* B*T 
HO9	H;=a  25T	T T T T@    *@ @ @ @ @r   