
    `iͥ                     V   d dl Z d dlmZ d dlZd dlmZ d dlmZmZ d dl	m
Z
mZ  edd          Zej        ej        ej        ej        ej        gZej        ej        gZd eD             Ze                    d	 eD                        d
 Ze                    d            e
edd          dej        dededededej        dej        fd            Z e
edd          dej        dededededej        dej        fd            Ze                    d            e
edd          dej        dej        dej        dededej        dej        fd            Z  e
edd          dej        dej        dej        dededej        dej        fd            Z!e                    d            e
edd          dej        dej        dej        dej        dej        dej        dej        fd            Z" e
edd          dej        dej        dej        dej        dej        dej        dej        fd            Z#e                    d             e
ed!d          dd"dej        dededededej        d#eej                 dej        fd$            Z$ e
ed!d          dd"dej        dej        dej        dededej        d#eej                 dej        fd%            Z%e                    d&            e
ed'd          dd"dej        dej        dej        dededej        d#eej                 dej        fd(            Z& e
ed'd          dd"dej        dej        dej        dededej        d#eej                 dej        fd)            Z'e                    d*            e
ed+d          dd"dej        dej        dej        dej        dej        dej        d#eej                 dej        fd,            Z( e
ed+d          dd"d#eej                 dej        fd-            Z)e                    d.            e
ed/d          dej        d0ed1ed2edej        de*ej        ej        f         fd3            Z+e                    d4            e
ed5d          dej        d0ed1ed2edej        de*ej        ej        f         fd6            Z, e
ed/d          dej        deded2edej        de*ej        ej        f         fd7            Z- e
ed5d          dej        deded2edej        de*ej        ej        f         fd8            Z.d9 Z/e                    d:            e
ed;d          dej        d<ej        d=ej        d>edededej        dej        fd?            Z0 e
ed;d          dej        d<ej        d=ej        d>edededej        dej        fd@            Z1e                    dA            e
edBd          dd"dej        d<ej        d=eej                 d>edededej        d#eej                 dej        fdC            Z2 e
edBd          dd"dej        d<ej        d=eej                 d>edededej        d#eej                 dej        fdD            Z3e                    dE            e
edFd          dej        dej        de*ej        ej        f         fdG            Z4 e
edFd          dej        dej        de*ej        ej        f         fdH            Z5e                    dI            e
edJdK          dej        dej        de*ej        ej        f         fdL            Z6e                    dM            e
edNd          dej        dej        de*ej        ej        f         fdO            Z7 e
edNd          dej        dej        de*ej        ej        f         fdP            Z8dQ Z9e                    dR            e
edSd          dej        d<ej        d=ej        dededej        fdT            Z: e
edSd          dej        d<ej        d=ej        dededej        fdU            Z;e                    dV            e
edWd          ej<        fdej        d<ej        d=ej        dededej        dXej        fdY            Z= e
edWd          ej<        fdej        d<ej        d=ej        dededej        dXej        fdZ            Z>e                    d[            e
ed\d          	 dpdej        d<ej        d=ej        dededej        fd^            Z? e
ed\d          	 dpdej        d<ej        d=ej        dededej        fd_            Z@e                    d`            e
edad          d]ej<        fdbej        d<ej        d=eej                 dededej        dcedXej        fdd            ZAe                    de            G df dgejB        jC                  ZD e
edhdi          dej        d<ej        d=ej        d>edededej        fdj            ZE e
edhd          dej        d<ej        d=ej        d>edededej        fdk            ZFe                    dl            e
edmd          dej        dej        dej        fdn            ZG e
edmd          dej        dej        dej        fdo            ZHdS )q    N)Optional)_unsqueeze_multiple)determine_qparamsvalidate_qmin_qmax)implLibraryquantized_decomposedDEFc                 t    i | ]5}|t          j        |          j        t          j        |          j        f6S  )torchiinfominmax.0ks     x/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torch/ao/quantization/fx/_decomposed.py
<dictcomp>r      sB       45AAEKNN./      c           	          i | ]O}|t          t          j        |          j                  t          t          j        |          j                  fPS r   )intr   finfor   r   r   s     r   r   r      sE    RRRqQU[^^	 	 #ek!nn&8"9"9:RRRr   c                     |t           vrt          d|           t           |         \  }}| |k    sJ d| d|              ||k    sJ d| d|             d S )NzUnsupported dtype: z9quant_min out of bound for dtype, quant_min_lower_bound: z quant_min: z9quant_max out of bound for dtype, quant_max_upper_bound: z quant_max: )_DTYPE_TO_QVALUE_BOUNDS
ValueError)	quant_min	quant_maxdtypequant_min_lower_boundquant_max_upper_bounds        r   _quant_min_max_bounds_checkr"      s    +++6u667773J53Q00----	Q"7	Q 	QEN	Q 	Q .--
 ----	Q"7	Q 	QEN	Q 	Q .----r   zxquantize_per_tensor(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> Tensorquantize_per_tensorCompositeExplicitAutogradinputscale
zero_pointr   r   r   returnc                 z   | j         t          j        t          j        fv r|                     t          j                  } | j         t          j        k    sJ d| j                      t          |||           d|z  }t          j        t          j        | |z            |z   ||                              |          S )a  Affine quantization for the Tensor using the same quantization parameters to map
    from floating point to quantized values

    Args:
       input (torch.Tensor): original float32 or bfloat16 Tensor
       scale (float): quantization parameter for affine quantization
       zero_point (int): quantization parameter for affine quantization
       quant_min (int): minimum quantized value for output Tensor
       quant_max (int): maximum quantized value for output Tensor
       dtype (torch.dtype): requested dtype (e.g. torch.uint8) for output Tensor

    Returns:
       Tensor with requested dtype (e.g. torch.uint8), note the quantization parameters
       are not stored in the Tensor, we are storing them in function arguments instead
    <Expecting input to have dtype torch.float32, but got dtype:       ?)	r   r   float16bfloat16tofloat32r"   clampround)r%   r&   r'   r   r   r   	inv_scales          r   r#   r#   1   s    0 {u}en555'';%-'''Tu{TT (''  	9e<<<eI;EI%&&3Y	 biir   Metac                     | j         t          j        t          j        fv r|                     t          j                  } | j         t          j        k    sJ d| j                      t          j        | |          S )Nr*   r   )r   r   r,   r-   r.   r/   
empty_liker%   r&   r'   r   r   r   s         r   quantize_per_tensor_metar8   V   so     {u}en555'';%-'''Tu{TT ('' E////r   zquantize_per_tensor.tensor(Tensor input, Tensor scale, Tensor zero_point, int quant_min, int quant_max, ScalarType dtype) -> Tensorzquantize_per_tensor.tensorc                 N   |                                 dk    sJ d|                                              |                                 dk    sJ d|                                              t          | |                                |                                |||          S zAffine quantization for the Tensor using the same quantization parameters to map
    from floating point to quantized values
    Same as `quantize_per_tensor` but scale and zero_point are Scalar Tensor instead of
    scalar values
       >Expecting zero_point tensor to be one element, but received : 9Expecting scale tensor to be one element, but received : numelr#   itemr7   s         r   quantize_per_tensor_tensorrA   m   s      """]IYIYI[I[]] #"" ;;==ASEKKMMSS  

  r   c                    | j         t          j        t          j        fv r|                     t          j                  } |                                dk    sJ d|                                             |                                dk    sJ d|                                             | j         t          j        k    sJ d| j                      t          j        | |          S )Nr;   r<   r=   r*   r5   )r   r   r,   r-   r.   r/   r?   r6   r7   s         r   quantize_per_tensor_tensor_metarC      s     {u}en555''"""]IYIYI[I[]] #"" ;;==ASEKKMMSS  ;%-'''Tu{TT ('' E////r   zquantize_per_tensor.tensor2(Tensor input, Tensor scale, Tensor zero_point, Tensor quant_min, Tensor quant_max, ScalarType dtype) -> Tensorzquantize_per_tensor.tensor2c                    |                                 dk    sJ d|                                              |                                 dk    sJ d|                                              t          | |                                |                                |                                |                                |          S r:   r>   r7   s         r   quantize_per_tensor_tensor2rE      s      """]IYIYI[I[]] #"" ;;==ASEKKMMSS  

  r   c                 *    t          | |||||          S N)rC   r7   s         r    quantize_per_tensor_tensor2_metarH      s(     +  r   zdequantize_per_tensor(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, ScalarType? out_dtype=None) -> Tensordequantize_per_tensor	out_dtyperK   c                    | j         |k    sJ d| d| j                      |t          j        }|t          v r|                     |          |z
  |z  S t          d|           )a  Affine dequantization for the Tensor using the same quantization parameters to map
    from quantized values to floating point values

    Args:
       input (torch.Tensor): Tensor with dtype matching `dtype` argument,
       e.g. (`torch.uint8`), it is a per tensor quantized Tensor if combined with
       quantization parameters in the argument of this function (scale/zero_point)

       scale (float): quantization parameter for affine quantization

       zero_point (int): quantization parameter for affine quantization

       quant_min (int): minimum quantized value for input Tensor (not used in computation,
       reserved for pattern matching)

       quant_max (int): maximum quantized value for input Tensor (not used in computation,
       reserved for pattern matching)

       dtype (torch.dtype): dtype for input Tensor (not used in computation,
       reserved for pattern matching)

       out_dtype (torch.dtype?): optional dtype for output Tensor

    Returns:
       dequantized float32 Tensor
    Expecting input to have dtype: z
, but got N,Unsupported dtype in dequantize_per_tensor: )r   r   r/   r   r.   r   r%   r&   r'   r   r   r   rK   s          r   rI   rI      s    J ;%H%HH5;HH   M	''' ##j0E99OOOPPPr   c                J    |t           j        }t          j        | |          S Nr5   )r   r/   r6   rO   s          r   dequantize_per_tensor_metarR     s'     M	E3333r   zdequantize_per_tensor.tensor(Tensor input, Tensor scale, Tensor zero_point, int quant_min, int quant_max, ScalarType dtype, *, ScalarType? out_dtype=None) -> Tensorzdequantize_per_tensor.tensorc          	      R   |                                 dk    sJ d|                                              |                                 dk    sJ d|                                              t          | |                                |                                ||||          S zAffine dequantization for the Tensor using the same quantization parameters to map
    from quantized values to floating point values
    Same as `dequantize_per_tensor` but scale and zero_point are Scalar Tensor instead of
    scalar values
    r;   r<   r=   rJ   r?   rI   r@   rO   s          r   dequantize_per_tensor_tensorrV   1  s    ( """]IYIYI[I[]] #"" ;;==ASEKKMMSS  !

   r   c                   |t           j        }|                                dk    sJ d|                                             |                                dk    sJ d|                                             | j        |k    sJ d|             |t          v rt          j        | |          S t          d|           )Nr;   r<   r=   rM   r5   rN   )r   r/   r?   r   r   r6   r   rO   s          r   !dequantize_per_tensor_tensor_metarX   V  s     M	"""]IYIYI[I[]] #"" ;;==ASEKKMMSS  ;%!J5!J!J'''Y7777OOOPPPr   zdequantize_per_tensor.tensor2(Tensor input, Tensor scale, Tensor zero_point, Tensor quant_min, Tensor quant_max, ScalarType dtype, *, ScalarType? out_dtype=None) -> Tensorzdequantize_per_tensor.tensor2c          	         |                                 dk    sJ d|                                              |                                 dk    sJ d|                                              t          | |                                |                                |                                |                                ||          S rT   rU   rO   s          r   dequantize_per_tensor_tensor2rZ   w  s    ( """]IYIYI[I[]] #"" ;;==ASEKKMMSS  !

   r   c          	      .    t          | ||||||          S )NrJ   )rX   rO   s          r   "dequantize_per_tensor_tensor2_metar\     s*     -uj)Y   r   zrchoose_qparams.tensor(Tensor input, int quant_min, int quant_max, float eps, ScalarType dtype) -> (Tensor, Tensor)zchoose_qparams.tensorqminqmaxepsc           
      z   | j         t          j        t          j        t          j        fv sJ d| j                      |t
          v s'J dt
                                           d|             t          ||           t          j        |           \  }}t          |||||t          j
        |g          d          S )[  Given an input Tensor, derive the per tensor affine quantization parameter
    (scale and zero_point) for target quantized Tensor from the Tensor

    Args:
       input (torch.Tensor): floating point input Tensor
       quant_min (int): minimum quantized value for target quantized Tensor
       quant_max (int): maximum quantized value for target quantized Tensor
       dtype (torch.dtype): dtype for target quantized Tensor

    Returns:
       scale (float): quantization parameter for the target quantized Tensor
       zero_point (int): quantization parameter for the target quantized Tensor
    CExpecting input to have dtype torch.float32/16/b16, but got dtype: $Expecting target dtype to be one of , but got: F)has_customized_qrange)r   r   r/   r,   r-   r   keysr   aminmaxr   Tensorr%   r]   r^   r_   r   min_valmax_vals          r   choose_qparams_tensorrl     s    " ;   
 	\ek[[   ++++a/F/K/K/M/MaaZ_aa ,++ tT"""}U++GWcU#   r   z|choose_qparams_symmetric.tensor(Tensor input, int quant_min, int quant_max, float eps, ScalarType dtype) -> (Tensor, Tensor)zchoose_qparams_symmetric.tensorc           
         | j         t          j        t          j        t          j        fv sJ d| j                      |t
          v s'J dt
                                           d|             t          ||           t          j        |           \  }}t          |||||t          j
        |g          dt          j                  S )ra   rb   rc   rd   F)re   qscheme)r   r   r/   r,   r-   r   rf   r   rg   r   rh   per_tensor_symmetricri   s          r   choose_qparams_symmetric_tensorrp     s    * ;   
 	\ek[[   ++++a/F/K/K/M/MaaZ_aa ,++ tT"""}U++GWcU#*	 	 	 	r   c                 >   | j         t          j        t          j        t          j        fv sJ d| j                      ||k     sJ d| d|             t          j        dt          j        | j                  t          j        dt          j        | j                  fS )Nrb   zKExpecting quant_min to be smaller than quant_max but received min:         z max: r;   r   device)	r   r   r/   r,   r-   emptydoublers   int64r%   r   r   r_   r   s        r   choose_qparams_tensor_metarx     s     ;   
 	\ek[[   y   	&		& 	&#	& 	& !   ;qU\BBBEK	U\E E E  r   c                     t          j        dt           j        | j                  t          j        dt           j        | j                  fS )Nr;   rr   )r   rt   ru   rs   rv   rw   s        r   $choose_qparams_symmetric_tensor_metarz   $  sG     ;qU\BBBEK	U\E E E  r   c                     t          t          |                                                     }d||<   ||d<   |                     t	          |                    }||fS )Nr   )listrangedimpermutetuple)xaxisnew_axis_listys       r   _permute_to_axis_zeror   .  sU    quuww((MM$M!			%&&''Amr   zquantize_per_channel(Tensor input, Tensor scales, Tensor zero_points, int axis, int quant_min, int quant_max, ScalarType dtype) -> Tensorquantize_per_channelscaleszero_pointsr   c                    | j         t          j        t          j        fv r|                     t          j                  } | j         t          j        k    sJ d| j                      ||                                 k     sJ d|                                              t          |||           t          | |          \  } }dg|                                 z  }|j	        d         |d<   |
                    |          }|
                    |          }t          j        t          j        | d|z  z            |z   ||          }	|	                    t          |                    }
|
                    |          S )at  Affine per channel quantization for the Tensor using the same quantization
    parameters for each channel/axis to map from floating point to quantized values

    Args:
       input (torch.Tensor): original float32 or bfloat16 Tensor
       scales (torch.Tensor): a list of scale quantization parameter for
       affine quantization, one per channel
       zero_point (torch.Tensor): a list of zero_point quantization parameter for
       affine quantization, one per channel
       quant_min (int): minimum quantized value for output Tensor
       quant_max (int): maximum quantized value for output Tensor
       dtype (torch.dtype): requested dtype (e.g. torch.uint8) for output Tensor

    Returns:
       Tensor with requested dtype (e.g. torch.uint8), note the quantization parameters
       are not stored in the Tensor, we are storing them in function arguments instead
    r*   Expecting axis to be < r;   r   r+   )r   r   r,   r-   r.   r/   r~   r"   r   shapeviewr0   r1   r   r   )r%   r   r   r   r   r   r   permute_axis_list	new_shaperesouts              r   r   r   <  sU   6 {u}en555'';%-'''Tu{TT ('' %))++FFF	9e<<<4UDAAEeiikk!I<?IaL[[##F""9--K
+ES6\*++k99i C ++e-..
/
/C66%==r   c                    | j         t          j        t          j        fv r|                     t          j                  } | j         t          j        k    sJ d| j                      ||                                 k     sJ d|                                              t          |||           t          j        | |          S )Nr*   r   r5   )	r   r   r,   r-   r.   r/   r~   r"   r6   )r%   r   r   r   r   r   r   s          r   quantize_per_channel_metar   l  s     {u}en555'';%-'''Tu{TT ('' %))++FFF	9e<<<E////r   zdequantize_per_channel(Tensor input, Tensor scales, Tensor? zero_points, int axis, int quant_min, int quant_max, ScalarType dtype, *, ScalarType? out_dtype=None) -> Tensordequantize_per_channelc                F   | j         |k    sJ d| d| j                      |t          j        }||                                 k     sJ d|                                              t	          |||           t          | |          \  } }dg|                                 z  }	|j        d         |	d<   |                    |	          }|| |                    |	          z
  |z  }
n| |z  }
|
                    |          }
|
	                    t          |                    }|S )a  Affine per channel dequantization for the Tensor using the same quantization
    parameters for each channel/axis to map from quantized values to floating point values

    Args:
       input (torch.Tensor): Tensor with dtype matching `dtype` argument,
       e.g. (`torch.uint8`), it is a per channel quantized Tensor if combined with
       quantization parameter in the argument of this function (scales/zero_points/axis)

       scales (torch.Tensor): a list of scale quantization parameter for
       affine quantization, one per channel

       zero_points (torch.Tensor): a list of zero_point quantization parameter for
       affine quantization, one per channel

       quant_min (int): minimum quantized value for output Tensor (not used in computation,
       reserved for pattern matching)

       quant_max (int): maximum quantized value for output Tensor (not used in computation,
       reserved for pattern matching)

       dtype (torch.dtype): requested dtype for output Tensor (not used in computation,
       reserved for pattern matching)

       out_dtype (torch.dtype?): optional dtype for output Tensor

    Returns:
       dequantized float32 Tensor
    Expecting input to have dtype , but got dtype: Nr   r;   r   )r   r   r/   r~   r"   r   r   r   r.   r   r   )r%   r   r   r   r   r   r   rK   r   r   r   r   s               r   r   r     s/   P ;%NNNNN   M	%))++FFF	9e<<<4UDAAEeiikk!I<?IaL[[##F{''	222f<fn
&&

C
++e-..
/
/CJr   c                   | j         |k    sJ d| d| j                      |t          j        }||                                 k     sJ d|                                              t	          |||           t          j        | |          S )Nr   r   r   r5   )r   r   r/   r~   r"   r6   )r%   r   r   r   r   r   r   rK   s           r   dequantize_per_channel_metar     s     ;%NNNNN   M	%))++FFF	9e<<<E3333r   zLchoose_qparams_per_token(Tensor input, ScalarType dtype) -> (Tensor, Tensor)choose_qparams_per_tokenc                    |                                                      dd          }|j        t          j        k    r|                                }|t          j        k    rd}d|dz
  z  dz
  }nt          d|           |                    d	          	                    |          }t          j
        |          }||fS )
  Choose quantization parameters for per token quantization. This means for a N dimension Tensor
    (M1, M2, ...Mn, N), we calculate scales/zero_points for each N elements and quantize
    every N elements with the same quantization parameter. The dimension for scales/zero_points
    will be (M1 * M2 ... * Mn)

    Args:
       input (torch.Tensor): original float32/float16 Tensor
       dtype (torch.dtype): dtype (e.g. torch.uint8) for input Tensor

    Returns:
        scales and zero_points, both float32 Tensors
    Tr~   keepdim      r;   z/unsupported dtype in choose_qparams_per_token: gh㈵>r   )absamaxr   r   r,   floatint8	Exceptionr0   div
zeros_like)r%   r   r   n_bitsr   r   s         r   r   r     s    , YY[["d33F|u}$$LLNN 	 
&1*%)		EeEE
 
 	
 \\d\##''	22F"6**K;r   c                     t          | j        d d                   dgz   }t          j        |t          j        | j                  t          j        |t          j        | j                  fS Nr   r;   rr   r|   r   r   rt   ru   rs   rv   r%   r   sizes      r   choose_qparams_per_token_metar     f     CRC !!QC'D;t5<EEEu{EKH H H  r   z]_choose_qparams_per_token_asymmetric_impl(Tensor input, ScalarType dtype) -> (Tensor, Tensor))_choose_qparams_per_token_asymmetric_implCompositeImplicitAutogradc                    d\  }}t          j        | dd          }t          j        | dd          }t          j        |t          j        |                    }t          j        |t          j        |                    }t          j        t           j                  j        }||z
  t          ||z
            z  }	|	
                    |          }	||	z  }
||	z  }||
z   }||z   }t          j        ||z   dk    ||
z
  ||z
            }t          j
        |||                                          }|	                    t           j                  |                    t           j                  fS )r   )i   r   Tr   r   r   )r   aminr   r   r   r   r   r/   r_   r   r0   wherer1   r.   float64rv   )r%   r   r]   r^   rj   rk   min_val_negmax_val_posr_   r&   descaled_mindescaled_maxzero_point_from_min_errorzero_point_from_max_errorr'   s                  r   r   r      sX   , JD$jB555GjB555G)GU%5g%>%>??K)GU%5g%>%>??K
+em
$
$
(C ;&%t*<*<<EKKCK  E &L&L $| 3 $| 3!$==A|| J
 Zt44::<<J88EM""JMM%+$>$>>>r   zWchoose_qparams_per_token_asymmetric(Tensor input, ScalarType dtype) -> (Tensor, Tensor)#choose_qparams_per_token_asymmetricc                 "    t          | |          S rG   )r   r%   r   s     r   r   r   U  s     5UEBBBr   c                     t          | j        d d                   dgz   }t          j        |t          j        | j                  t          j        |t          j        | j                  fS r   r   r   s      r   (choose_qparams_per_token_asymmetric_metar   a  r   r   c                 d   t          j        t          |                                           d d                   }||                                k    s"J d| d|                                             ||                                k    s"J d| d|                                             d S )Nr   znum_tokens: z	 scales: z zero_points: )mathprodr|   r   r?   )r%   r   r   
num_tokenss       r   !_per_token_quant_qparam_dim_checkr   p  s    4

--crc233J''';z;;FKKMM;; ('' **,,,,,EzEE1A1A1C1CEE -,,,,r   z}quantize_per_token(Tensor input, Tensor scales, Tensor zero_points, int quant_min, int quant_max, ScalarType dtype) -> Tensorquantize_per_tokenc                    t          |||           t          | ||           |                     d|z                                |                                                              ||                              |          } | S )a  Per token quantization for the Tensor using the quantization parameters to map
    from floating point to quantized values. This means for a N dimension Tensor
    (M1, M2, ...Mn, N), we calculate scales/zero_points for each N elements and quantize
    every N elements with the same quantization parameter. The dimension for scales/zero_points
    will be (M1 * M2 ... * Mn)

    Args:
       input (torch.Tensor): original float32 or bfloat16 Tensor
       scales (float32 torch.Tensor): quantization parameter for per token affine quantization
       zero_points (int32 torch.Tensor): quantization parameter for per token affine quantization
       quant_min (int): minimum quantized value for output Tensor
       quant_max (int): maximum quantized value for output Tensor
       dtype (torch.dtype): requested dtype (e.g. torch.uint8) for output Tensor

    Returns:
       Tensor with requested dtype (e.g. torch.uint8), note the quantization parameters
       are not stored in the Tensor, we are storing them in function arguments instead
    r+   )r"   r   muladdr1   r0   r.   r%   r   r   r   r   r   s         r   r   r     sw    6  	9e<<<%eV[AAA		#,	[				y)	$	$	E 
 Lr   c                 P    t          |||           t          j        | |          S rQ   r"   r   r6   r   s         r   quantize_per_token_metar     s,      	9e<<<E////r   zdequantize_per_token(Tensor input, Tensor scales, Tensor zero_points, int quant_min, int quant_max, ScalarType dtype, ScalarType output_dtype) -> Tensordequantize_per_tokenoutput_dtypec                 @    | |z
  } | |z  } |                      |          S )a  Per token dequantization for the Tensor using the quantization parameters to map
    from floating point to quantized values. This means for a N dimension Tensor
    (M1, M2, ...Mn, N), we calculate scales/zero_points for each N elements and quantize
    every N elements with the same quantization parameter. The dimension for scales/zero_points
    will be (M1 * M2 ... * Mn)

    Args:
       input (torch.Tensor): quantized Tensor (uint8, int8 etc.)
       scales (float64 torch.Tensor): quantization parameter for per token affine quantization
       zero_points (int64 torch.Tensor): quantization parameter for per token affine quantization
       quant_min (int): minimum quantized value for input Tensor
       quant_max (int): maximum quantized value for input Tensor
       dtype (torch.dtype): dtype (e.g. torch.uint8) for input Tensor
       output_dtype (torch.dtype): dtype (e.g. torch.float32) for output Tensor

    Returns:
       dequantized Tensor with dtype `output_dtype`
    )r.   r%   r   r   r   r   r   r   s          r   r   r     s)    8 KEFNE88L!!!r   c                 P    t          |||           t          j        | |          S rQ   r   r   s          r   dequantize_per_token_metar     s,      	9e<<<E6666r   zquantize_per_channel_group(Tensor input, Tensor scales, Tensor zero_points, int quant_min, int quant_max, ScalarType dtype, int group_size) -> Tensorquantize_per_channel_group   c                    |dk    sJ || j         d         k    r|j         d         dk    r| j         d         }| j         d         |z  dk    sJ |                                 dk    sJ |                     d|          }t          j        |                                          dk    sJ |                    dd          }|                    dd          }|                    d|z                                |                                          	                    ||          
                    |                              |           }|S )Nr;   r   r   r   r+   )r   r~   reshaper   isnansumr   r   r1   clamp_r.   
reshape_as)	r%   r   r   r   r   r   
group_sizeto_quant
input_int8s	            r   r   r     s6    >>>>EKO##R(8A(=(=[_
;r?Z'1,,,,99;;! }}R,,H;x  $$&&!++++^^B""F%%b!,,K 	S6\""	[					9	%	%	E	E		  r   c                     |dk    sJ || j         d         k    r|j         d         dk    r| j         d         }| j         d         |z  dk    sJ |                                 dk    sJ t          j        | |          S )aX  Groupwise quantization within each channel for an 2-d Tensor using the quantization parameters
    to map from floating point to quantized values. This means for each row of a 2-d Tensor
    (M, N), we calculate scales/zero_points for each `group_size` elements
    and quantize every `group_size` elements with the same quantization parameter.
    The dimension for scales/zero_points will be (M * ceil(N, group_size),)

    Args:
       input (torch.Tensor): original float32 or bfloat16 Tensor
       scales (float32 torch.Tensor): quantization parameter for per channel group affine quantization
       zero_points (int32 torch.Tensor): quantization parameter for per channel group affine quantization
       quant_min (int): minimum quantized value for output Tensor
       quant_max (int): maximum quantized value for output Tensor
       dtype (torch.dtype): requested dtype (e.g. torch.uint8) for output Tensor

    Returns:
       Tensor with requested dtype (e.g. torch.uint8), note the quantization parameters
       are not stored in the Tensor, we are storing them in function arguments instead
    r;   r   r   r   r5   )r   r~   r   r6   )r%   r   r   r   r   r   r   s          r   quantize_per_channel_group_metar     s    8 >>>>EKO##R(8A(=(=[_
;r?Z'1,,,,99;;!E////r   zdequantize_per_channel_group(Tensor input, Tensor scales, Tensor? zero_points, int quant_min, int quant_max, ScalarType dtype, int group_size, ScalarType output_dtype) -> Tensordequantize_per_channel_groupw_int8r   c                 F   |dk    sJ || j         d         k    r|j         d         dk    r| j         d         }| j         d         |z  dk    sJ |                                 dk    sJ |                     d|          }|                    dd          }||                    dd          }	n&t          j        g t          j        |j                  }	|                    |	                              |          	                    |           
                    |          }
|
S )a!  Groupwise dequantization within each channel for an 2-d Tensor using the quantization parameters
    to map from floating point to quantized values. This means for each row of a 2-d Tensor
    (M, N), we calculate scales/zero_points for each `group_size` elements
    and quantize every `group_size` elements with the same quantization parameter.
    The dimension for scales/zero_points will be (M * ceil(N, group_size),)

    Args:
       input (torch.Tensor): quantized Tensor (uint8/int8 etc.)
       scales (float32 torch.Tensor): quantization parameter for per channel group affine quantization
       zero_points (int32 torch.Tensor): quantization parameter for per channel group affine quantization
       quant_min (int): minimum quantized value for input Tensor
       quant_max (int): maximum quantized value for input Tensor
       dtype (torch.dtype): dtype (e.g. torch.uint8) for input Tensor
       output_dtype (torch.dtype): dtype (e.g. torch.float32) for output Tensor

    Returns:
       dequantized Tensor with dtype `output_dtype`
    r;   r   r   r   Nrr   )r   r~   r   r   zerosint32rs   subr   r   r.   )r   r   r   r   r   r   r   r   w_int8_groupedzpw_dqs              r   r   r   E  s   D >>>>FL$$$b)9Q)>)>\"%
<j(A----::<<1^^B
33N^^B""F  Q''[5;v}EEEb!!%%f--88@@CCLQQDKr   zyfake_quant_per_channel(Tensor input, Tensor scales, Tensor zero_points, int axis, int quant_min, int quant_max) -> Tensorc                   :    e Zd Zed             Zed             ZdS )FakeQuantPerChannelc                 .   |j         t          j        k    r|                    t          j                  }|j         t          j        k    r|                    t          j                  }|j         t          j        k    sJ d|j                      ||                                k     sJ d|                                             t          t          d|                    t          t          |dz   |j                            z   }t          ||          }t          ||          }	t          j
        |d|z  z            |	z   }
t          j        |
||          |	z
  |z  }t          j        |
|k    |
|k              }|                     |           |S )Nr*   r   r   r;   r+   )r   r   r/   r.   r   r~   r|   r}   ndimr   r1   r0   logical_andsave_for_backward)ctxr%   r   r   r   r   r   broadcast_dimsunsqueeze_scalesunsqueeze_zero_pointstempr   masks                r   forwardzFakeQuantPerChannel.forward  so   <5=((YYu}--F++%..55K{em+++X5;XX ,++ eiikk!!!#JUYY[[#J#J!!!eAtnn--U4!8UZ5P5P0Q0QQ.v~FF 3K P P{5C*:$:;<<?TTKi336KK  $)"3ty7HJJd###
r   c                 *    | j         \  }||z  d d d d d fS rG   )saved_tensors)r   gyr   s      r   backwardzFakeQuantPerChannel.backward  s#    #Dy$dD$66r   N)__name__
__module____qualname__staticmethodr   r   r   r   r   r   r   ~  sH          \* 7 7 \7 7 7r   r   fake_quant_per_channelAutogradc                 @    t                               | |||||          S rG   )r   applyr%   r   r   r   r   r   s         r   r  r    s)     $$v{D)Y  r   c                 *    t          j        |           S rG   r   r6   r  s         r   fake_quant_per_channel_metar    s     E"""r   zFconvert_element_type.no_fuse(Tensor input, ScalarType dtype) -> Tensorzconvert_element_type.no_fusec                 V    t           j        j        j                            | |          S rG   )r   opsprimsconvert_element_typedefaultr   s     r   r  r    s!     9?/77uEEEr   c                 .    t          j        | |          S rQ   r  r   s     r   convert_element_type_metar    s    E////r   )r   )Ir   typingr   r   torch._refsr   torch.ao.quantization.utilsr   r   torch.libraryr   r   quantized_decomposed_libuint8r   uint16int16r   _INTEGER_DTYPESfloat8_e5m2float8_e4m3fn_FLOAT_DTYPESr   updater"   definerh   r   r   r   r#   r8   rA   rC   rE   rH   rI   rR   rV   rX   rZ   r\   r   rl   rp   rx   rz   r   r   r   r   r   r   r   r   r   r   r   r   r   r/   r   r   r   r   r   autogradFunctionr   r  r  r  r  r   r   r   <module>r      s           + + + + + + M M M M M M M M ' ' ' ' ' ' ' '
 #7#95AA ;
EL%+u{S"E$78 9H      RRMRRR        @    57RSS!<!! ! 	!
 ! ;! \! ! ! TS!H  5v>>0<00 0 	0
 0 ;0 \0 0 0 ?>0    @   :<W <<  	
  ; \   :  <fEE0<0<0 0 	0
 0 ;0 \0 0 0 FE0.   F   ;=X <<  |	
 | ; \   :  =vFF<<  |	
 | ; \   GF,   _    79TUU (,/Q /Q /Q</Q/Q /Q 	/Q
 /Q ;/Q $/Q \/Q /Q /Q VU/Qd  7@@ (,4 4 4<4<4 4 	4
 4 ;4 $4 \4 4 4 A@4   _   "  (,  <<  	
  ; $ \   
@  >GG (,Q Q Q<Q<Q Q 	Q
 Q ;Q $Q \Q Q Q HGQ4   e   #  (,  <<  |	
 | ; $ \   
@  ?HH (,   $ \   IH   7    79TUU&<&"&*-&49&BG+&
5<%&& & & VU&R   7   % 
&<&"&*-&49&BG+&
5<%&& & & 
&R  7@@<$'47>CLQK
5<%&   A@&  A6JJ<$'47>CLQK
5<%&   KJ     @    68STT,<,L, , 	,
 , , ;, \, , , UT,^  6??0<0L0 0 	0
 0 0 ;0 \0 0 0 @?0.   _    8:UVV (,; ; ;<;L; %,'; 	;
 ; ; ;; $; \; ; ; WV;|  8&AA (,4 4 4<4L4 %,'4 	4
 4 4 ;4 $4 \4 4 4 BA4*   R  
  
 < ;  5<%&      
 F 
 
<; 5<%&   
   c  
 / 
(?<(?;(? 5<%&(? (? (? 
(?V   ]  
 ) 
C<C;C 5<%&C C C 
C )
 
<; 5<%&   
     @    46QRR#<#L# # 	#
 # ;# # # SR#L  4f==	0<	0L	0 	0 		0
 	0 ;	0 	0 	0 >=	0   Y    68STT !&" "<"L" " 	"
 " ;" +" " " UT"B  6?? !&7 7<7L7 7 	7
 7 ;7 +7 7 7 @?7   A   :<W  ! !<!L! ! 	!
 ! ;! ! ! !H  <fEE "0 "0<"0L"0 "0 	"0
 "0 ;"0 "0 "0 FE"0J   Z   "   %+ +L+L+ %,'+ 	+
 + ;+ + ++ + + 
+\   .  7 7 7 7 7%.1 7 7 7:  8*EE
<
L
 
 	

 
 
 \
 
 
 FE
  8&AA#<#L# # 	#
 # # \# # # BA#   L  
 " 
F FU[ FU\ F F F 
F  >GG0U\ 0%+ 0%, 0 0 0 HG0 0 0r   