
    Pi                       d Z ddlZddlZddlZddlZddlmZ ddlmZm	Z	 ddl
mZ ddlmZmZmZmZmZmZ ddlmZ ddlZddlmZ ddlmc mc mZ ddlZddlmZ ddlmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0 dd	l1m2Z2m3Z3 dd
l4m5Z5 ddl6m7Z7m8Z8 ddl9m:Z: ddl;m<Z<m=Z=m>Z>m?Z?m@Z@ ddlAmBZB ddlCmDZD ddlEmFZF ddlGmHZHmIZImJZJmKZKmLZLmMZMmNZNmOZOmPZPmQZQmRZRmSZSmTZTmUZU ddlVmWZWmXZX ddlYmZZZ ddl[m\Z\ ddl]m^Z^m_Z_m`Z`maZa ddlbmcZcmbZb ddldmeZe ddlfmgZgmhZhmiZimjZjmkZk ddllmmZmmnZn ddlompZpmqZq ddlrmsZs ddltmuZumvZvmwZwmxZx ddlymzZzm{Z{ ddlm|Z|  ej}        e~          Zg d Ze+ewj        ge&ewj        ge"ewj        ge#ewj        ewj        giZe+d!e&d"e"d!e#d!iZ	 	 	 dd%eeed&f                  d'dfd(Zd) Zdd*Zdd+Zdd,d-ej        d.eeD         d/eeD         d0eeej        j        egef                  fd1Zd2ej        fd3Zd4 Zd5 Zd6 Zd!d!d7d8Zedfd-ej        j        d9ed0eeej        j        egef                  d:eej        j                 fd;Zd<ej        d'ej        fd=Zd<ej        d'ej        fd>Zd<ej        d'ej        fd?Ze G d@ dAe                      Z e^dBe          Z eXe          dddCdDej        j        d9edEeej                 dFeej                 fdG            Ze G dH dIe                      ZdddCdEeej                 dFeej                 fdJZ eXe          dddCdDej        j        d9edEeej                 dFeej                 d'ej        j        f
dK            Ze G dL dMe                      Z e^dNe          Z eXe          dDej        j        d9ed'ej        j        fdO            Ze G dP dQe                      Z e^dRe          Z eXe          dDej        j        d9efdS            Ze G dT dUe                      Z e^dVe          ZdW Z eXe          dDej        j        d9ed'ej        j        fdX            Ze G dY dZe                      Z eXe          dDej        j        d9ed'ej        j        fd[            Ze G d\ d]e                      Z e^d^e          Zd_ Z eXe          d2d`dDej        j        d9edaefdb            Zd<ej        d'ej        fdcZd<ej        d'ej        fddZd<ej        d'ej        fdeZd<ej        d'ej        fdfZd<ej        dgej        d'ej        fdhZd<ej        dgej        d'ej        ej        ffdiZe G dj dke                      Z e^dle          Zdm Z eXe          dDej        j        d9ed'ej        j        fdn            Zdo Ze G dp dqe                      Z e^dre          Zds Z eXe          d2d`dDej        j        d9edaed'ej        j        fdt            Z	 	 dd<ej        due=dvej        dweej                 dxeej                 f
dyZd2ej        d'efdzZe G d{ d|e                      Z e^d}e          Zd~ Z eXe          d2d`dDej        j        d9edaefd            Ze G d de                      Z eXeĦ          dDej        j        d9efd            Ze G d de                      Z e^deƦ          Z eXeƦ          dDej        j        d9efd            Ze G d de                      Z e^deɦ          Z eXeɦ          dDej        j        d9efd            ZdeTdej        dej        d'dfdZe G d de                      ZdddCdEeej                 dFeej                 fdZ eXeͦ          dddCdDej        j        d9edEeej                 dFeej                 d'ej        j        f
d            Ze G d de                      Z e^deЦ          Z eXeЦ          dDej        j        d9ed'ej        j        fd            Ze G d de                      ZeZeeehZdDej        j        ded9efdZded9efdZdDej        ded9efdZdDe:d'ej        fdZej                            eeeeeeee2g           dS )ap  
Quantization APIs

Generally these APIs can be applied directly to any model
with Linear modules to obtain quantized linear ops. The intended
usage involves applying torch.compile to the model afterwards
both because primitives were designed based on the fusions that
come along with it and because that is how we access the intended quantized
and mixed GEMM kernels
    N)OrderedDict)	dataclassfield)partial)AnyCallableListOptionalTupleUnion)AOBaseConfig)AffineQuantizedTensorCutlassInt4PackedLayoutCutlassSemiSparseLayoutFloat8LayoutInt4CPULayoutInt4XPULayout!Int8DynamicActInt4WeightCPULayoutMarlinQQQLayoutMarlinSparseLayout1PackedLinearInt8DynamicActivationIntxWeightLayoutPlainLayout	QDQLayoutSemiSparseLayoutTensorCoreTiledLayoutUintxLayoutto_affine_quantized_floatx!to_affine_quantized_floatx_staticto_affine_quantized_intxto_marlinqqq_quantized_intx)Target=make_packed_linear_int8_dynamic_activation_intx_weight_tensor)Layout)
e4m3_dtype
e5m2_dtype)Float8Linear)Float8MMConfigFP8Granularity_check_hardware_support!_granularity_is_a_1_128_w_128_128_normalize_granularity)$LinearActivationWeightObservedTensor)AffineQuantizedObserverBase)KernelPreference)Float8TensorInt4ChooseQParamsAlgorithmInt4MarlinSparseTensorInt4PackingFormatInt4PlainInt32TensorInt4PreshuffledTensor
Int4TensorInt4TilePackedTo4dTensor
Int8TensorIntxChooseQParamsAlgorithmIntxOpaqueTensorIntxPackingFormatIntxUnpackedToInt8TensorQuantizeTensorToFloat8Kwargs)_QUANTIZE_CONFIG_HANDLER register_quantize_module_handler)get_block_size)=to_weight_tensor_with_linear_activation_quantization_metadata)_ConfigDeprecationWrapperis_MI300is_sm_at_least_89is_sm_at_least_90   )AutoQuantizableLinearWeight	autoquant)Int4WeightOnlyGPTQQuantizer)GranularityPerAxisPerGroupPerRow	PerTensor)LinearActivationQuantizedTensorto_linear_activation_quantized)Int4WeightOnlyQuantizerInt8DynActInt4WeightQuantizer) intx_quantization_aware_training)_DTYPE_TO_QVALUE_BOUNDSMappingTypeZeroPointDomainquantize_affine)	QuantizerTwoStepQuantizer)_get_per_token_block_size)swap_conv2d_1x1_to_linearrW   rX   rH   rP   rG   _get_subclass_inserter	quantize_#int8_dynamic_activation_int4_weight#int8_dynamic_activation_int8_weight/int8_dynamic_activation_int8_semi_sparse_weightint4_weight_onlyint8_weight_onlyrR   float8_weight_onlyuintx_weight_onlyfpx_weight_onlygemlite_uintx_weight_only'float8_dynamic_activation_float8_weight&float8_static_activation_float8_weightrQ   3Float8DynamicActivationFloat8SemiSparseWeightConfigModuleFqnToConfigFT  
extra_args.returnc           	      V    || |dd                   r#||                      |            || g|R  } | S t          |                                           }|D ]5\  }}t          |||| | d||          }	|	|ur|	t	          | ||	           6||                      |           | S )a'  
    Recursively replaces each child module in `model` with the result of `replacement_fn(child)`
    if `filter_fn(child)` returns `True`.

    Args:
        model (torch.nn.Module): The model containing modules to be replaced.
        replacement_fn (Callable[[torch.nn.Module], torch.nn.Module]): The function to replace matching modules.
        filter_fn (Callable[[torch.nn.Module], bool]): The filter function to determine which modules to replace.
        cur_fqn (str, optional): The current fully qualified name of the module being processed. Defaults to "".
        device (device, optional): Device to move the model to before applying `filter_fn`. Defaults to None.
        extra_args (Tuple[Any, ...], optional): optional extra args to pass to `replacement_fn`.

    Returns:
        None
    Ndevice.)tolistnamed_children)_replace_with_custom_fn_if_matches_filtersetattr)
modelreplacement_fn	filter_fncur_fqnrq   rl   named_children_listnamechild	new_childs
             r/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchao/quantization/quant_api.pyrv   rv      s    . y%% HHFH###u2z222"5#7#7#9#9::. 
	0 
	0KD%A#D### I %%)*?tY///HHFH###    c                    ddl m} t          | t          j        j                  ot          | d          ot          | j        t                     opt          | j        t                     oUt          | j        t                     o:t          | j        |           o$t          | t          j        j        j                   S )Nr   )_AffineFakeQuantizedTensorweight)5torchao.quantization.qat.affine_fake_quantized_tensorr   
isinstancetorchnnLinearhasattrr   rF   r   rN   moduleslinearNonDynamicallyQuantizableLinear)modargsr   s      r   
_is_linearr      s          	3(( 	SC""	S3:'BCCC	S 3:'<===	S 3:'FGGG		S
 3:'ABBB	S 3
 1 QRRRr   c                 z                          dd                               dd           fd}|S )a  
    Returns a function which inserts the given subclass into all linear modules
    in the model. The inserted module will have its weight set to the result of
    `cls(mod.weight, **kwargs)`. If parametrization is enabled then this will be done using
    torch.nn.utils.parametrize instead of directly setting the attribute on the module.

    Args:
        cls (torch.Tensor): The class to insert as a child module.
        kwargs (Any): Any additional arguments for the constructor.
    constructorsubclass_constructormethod
from_floatc           	         r{t           j                             j        | j        fi d          | _        | j                                        \  }}t          j        | d t                    |            n@t           j                             t                    | j        fi d          | _        | S )NFrequires_gradr   )	r   r   	Parameterr   r   __tensor_flatten__parametrizeregister_parametrizationgetattr)lin_r   clsr   enable_parametrizationr   kwargss      r   insert_subclassz/_get_subclass_inserter.<locals>.insert_subclass  s    ! 	++sz44V44E ,  CJ j3355GAt0X8wsK88$?    ++(Z((>>v>># ,  CJ
 
r   )pop)r   r   r   r   r   r   s   ``` @@r   r[   r[      se     **],BCCKHl33J        " r   c                 z     G d dt           j        j                  fd}|d }t          | ||           dS )zi
    Changes all conv2d 1x1 modules to equivalent linear modules so that they can then be quantized.
    c                   $     e Zd Z fdZd Z xZS )2swap_conv2d_1x1_to_linear.<locals>.PermuteSandwichc                 V    t                                                       || _        d S N)super__init__r   )selfr   	__class__s     r   r   z;swap_conv2d_1x1_to_linear.<locals>.PermuteSandwich.__init__  s$    GGDHHHr   c                     |                      |d                             dddd                                        dddd          S )Nr         rE   )r   permute)r   r   s     r   forwardz:swap_conv2d_1x1_to_linear.<locals>.PermuteSandwich.forward#  s>    88DGOOAq!Q7788@@Q1MMMr   )__name__
__module____qualname__r   r   __classcell__)r   s   @r   PermuteSandwichr     sN        	 	 	 	 		N 	N 	N 	N 	N 	N 	Nr   r   c                 ,   | j         dk    sJ t          j                            | j        | j        | j        d u           }t          j                            | j        	                    dd                    |_        | j        |_         |          S )NrE   rE   )biasro   )
kernel_sizer   r   r   in_channelsout_channelsr   r   r   squeeze)convr   r   s     r   replace_conv2d_1x1z5swap_conv2d_1x1_to_linear.<locals>.replace_conv2d_1x1&  s    6))))hood/tyD7H  
 
 X''(;(;B(C(CDD
9s###r   Nc                 V    t          | t          j        j                  o
| j        dk    S )Nr   )r   r   r   Conv2dr   )r   r   s     r   <lambda>z+swap_conv2d_1x1_to_linear.<locals>.<lambda>0  s+    z(
 (
 ((o' r   rz   )r   r   Modulerv   )rx   rz   r   r   s      @r   rZ   rZ     s    
N N N N N%(/ N N N$ $ $ $ $ ( (	 .!Y     r   r   rx   input_observerweight_observerrz   c                d    dt           j        ffd}t          | ||t          n|           dS )a  
    Converts the weight of a linear module to a LinearActivationWeightObservedTensor.

    This function wraps the weight of the given linear module with a LinearActivationWeightObservedTensor,
    which enables observation of both input and weight tensors during forward passes.
    The wrapped weight is then re-wrapped as a nn.Parameter to maintain compatibility
    with PyTorch's module system.

    Example::

    ```
        import torch
        import torch.nn as nn
        from torchao.quantization import PerTensor
        from torchao.quantization.linear_observer_tensor import insert_observers_
        from torchao.quantization.observer import (
            AffineQuantizedMinMaxObserver,
            MappingType
        )

        # Create observers
        input_observer = AffineQuantizedMinMaxObserver(
            MappingType.SYMMETRIC,
            torch.float8_e4m3fn,
            granularity_type=PerTensor(),
            eps=torch.finfo(torch.float32).eps,
            scale_dtype=torch.float,
            zero_point_dtype=torch.int,
            zero_point_domain=ZeroPointDomain.NONE,
        )

        # Create a linear module
        linear_module = nn.Linear(10, 20)

        # Convert the linear module's weight to an observed tensor
        insert_observers_(linear_module, input_observer, weight_observer=None)

        # The linear_module can now be used as usual, with observers calculating statistics
        output = linear_module(torch.randn(10, 10))

        # Get the scale and zero point of the input observer
        scale, zero_point = linear_module.weight.input_observer.calculate_qparams()
    ```

    Args:
        model (nn.Module): The nn.Module to convert.
        input_observer (Optional[AffineQuantizedObserverBase]): Observer for input tensor.
        weight_observer (Optional[AffineQuantizedObserverBase]): Observer for weight tensor.
        filter_fn (Optional[Callable[[torch.nn.Module, str], bool]]): Filter function to select which modules to convert.
            If not provided, all linear modules will be converted. This function should take a module and its fully qualified name.

    Returns:
        nn.Linear: The modified linear module with its weight wrapped in a LinearActivationWeightObservedTensor.
    linear_modulec                     t          j        t          j        | j                  | j        j                  | _        | S )N)r   r   r   )r   r   r,   r   r   r   )r   r   r   s    r   convert_to_linear_observerz5insert_observers_.<locals>.convert_to_linear_observerw  sP    !|0;$- /  
 (.< 
  
  
 r   N)r   r   rv   r   )rx   r   r   rz   r   s    ``  r   insert_observers_r   9  sa    |
") 
 
 
 
 
 
 
 ."'

Y    r   r   c                    t          | t                    r$| j        j         d|                                  dS t          | t
                    r,| j        j         d| j         dt	          | j                   dS t          | d          r$| j        j         d|                                  dS t          |           t          j        u st          | t          j        j                  rdt          |            S dt          |            S )N()z(activation=	, weight=_quantization_typezTensor: znot recognized: )r   r   r   r   r   rN   input_quant_funcoriginal_weight_tensorr   typer   Tensorr   r   )r   s    r   r   r     s8   &/00 M"+LLf.G.G.I.ILLLL&9:: P"+  P  P9P  P  P[mnt  oL  \M  \M  P  P  P  	Pv+,, M"+LLf.G.G.I.ILLLLF||u|##z&%(:L'M'M#($v,,(((,d6ll,,,r   c                 |    d| j         j        d          d| j         j        d          dt          | j                    S )Nzin_features=rE   z, out_features=r   r   r   shaper   r   s    r   _linear_extra_reprr     sB    $++A.t{?PQR?S^pquq|^}^}r   c                 |    d| j         j        d          d| j         j        d          dt          | j                    S )Nznum_embeddings=r   z, embedding_dim=rE   r   r   r   s    r   _embedding_extra_reprr     sf     DT[.q1  D  D4;CTUVCW  D  Dbtuy  vA  cB  cB  D  D  Dr   c           
          g } |            }t          |          dk    r|                    |           |                    | dt          t          | |                                d                    |          S )Nr   =z, )lenappendr   r   join)r   original_extra_reprparameter_namemodule_torchao_extra_reproriginal_extra_repr_strs        r   _module_extra_reprr     s     "1133
"##a''!(()@AAA$$OO.wt^/L/LMMOO   99.///r   )allow_requires_gradpropagate_biasc                      fd}|S )zHelper function to apply the constructor that quantizes the weight Tensor (with additional kwargs)
    to the weight of linear module
    c                     o| j         j        }dk    r
| j        d<   t          j                             | j         fi |          | _         t          j        t          |           | _	        | S )NTr   r   )
r   r   r   r   r   r   types
MethodTyper   
extra_repr)r   r   r   r   r   r   s     r   r   z6_get_linear_subclass_inserter.<locals>.insert_subclass  s{    +H
0HT!! XF6NX''K
--f--] ( 
 

 )*<cBB
r   rk   )r   r   r   r   r   s   ```` r   _get_linear_subclass_inserterr     s6            r   configrq   c                 H   t           j                            d           t          |t                    r|t          d          |                                 D ]g\  }}t          ||          s)t          |||          sd|j	        v r8t          |          r)t          |||           ||                    |           hdS t          |t                    r<|t          n|}t          t          |                   }t!          | ||||f           dS t#          d          )a  Convert the weight of linear modules in the model with `config`, model is modified inplace

    Args:
        model (torch.nn.Module): input model
        config (AOBaseConfig): a workflow configuration object.
        filter_fn (Optional[Callable[[torch.nn.Module, str], bool]]): function that takes a nn.Module instance and fully qualified name of the module, returns True if we want to run `config` on
        the weight of the module
        device (device, optional): Device to move module to before applying `filter_fn`. This can be set to `"cuda"` to speed up quantization. The final model will be on the specified `device`.
            Defaults to None (do not change device).

    Example::

        import torch
        import torch.nn as nn
        from torchao import quantize_

        # quantize with some predefined `config` method that corresponds to
        # optimized execution paths or kernels (e.g. int4 tinygemm kernel)
        # also customizable with arguments
        # currently options are
        # Int8DynamicActivationInt4WeightConfig (for executorch)
        # Int8DynamicActivationInt8WeightConfig (optimized with int8 mm op and torch.compile)
        # Int4WeightOnlyConfig (optimized with int4 tinygemm kernel and torch.compile)
        # Int8WeightOnlyConfig (optimized with int8 mm op and torch.compile
        from torchao.quantization.quant_api import int4_weight_only

        m = nn.Sequential(nn.Linear(32, 1024), nn.Linear(1024, 32))
        quantize_(m, Int4WeightOnlyConfig(group_size=32, version=1))

    ztorchao.quantization.quantize_NzuCustom filter_fn and FqnToConfig were both specified. Only filter_fn=None is supported when FqnToConfig is specified._defaultrp   )rq   rl   zPassing a generic Callable to `quantize_` is no longer recommended and will be deprecated at a later release. Please see https://github.com/pytorch/ao/issues/1690 for instructions on how to pass in workflow configuration instead.)r   _C_log_api_usage_oncer   FqnToConfig
ValueErrornamed_modulesfqn_matches_fqn_config _module_param_matches_fqn_configfqn_to_configr   _fqn_to_config_handlerrs   r   r=   r   rv   AssertionError)rx   r   rz   rq   
module_fqnmodulehandlers          r   r\   r\     sa   H 
H  !ABBB&+&&   H   #("5"5"7"7 		- 		-J&z6::-3FJOO- &"666:f;M;M6 'vz6BBB%IIVI,,,&,'' 
"+"3JJ	*4<<81y	
 	
 	
 	
 	
 	
  x
 
 	
r   xc           	          t           j        }t          j        }t          j        }t          j        t          j                  j        }t          j        }t          | |t          |           ||||          S )zGThis is defined here instead of local function to support serialization)epsscale_dtypezero_point_dtype)	rT   
ASYMMETRICr   int8float32finfor  r   rY   )r   mapping_typetarget_dtyper  r  r  s         r   _int8_asymm_per_token_quantr    sh    )L:L-K
+em
$
$
(Cz#	!!$$)   r   c                     t           j        }t          j        }t          j        }t          j        t          j                  j        }t          j        }d}d}t          | |t          |           ||||||	  	        }|S )Nr      )	quant_min	quant_maxr  r  r  )
rT   r  r   uint8r  r  r  int32r   rY   )	r   r	  r
  r  r  r  r  r  outs	            r   _uint8_asymm_per_token_quantr    s|    )L;L-K
+em
$
$
(C{II
"	!!$$)
 
 
C Jr   c           
          t           j        }t          j        }d}d}d}t	          | |t          |           ||||t          j                  S Nh㈵>   r  r  r  r  )rT   	SYMMETRICr   r  r   rY   r  r   r	  r
  r  r  r  s         r   _int8_symm_per_token_quantr  4  sY    (L:L
CII#	!!$$M	 	 	 	r   c                       e Zd ZU dZdZeed<    e            Ze	ed<   e
j        Ze
ed<   e
j        Ze
ed<   dZeed<   d	 Zd
S )%Int8DynamicActivationInt4WeightConfiga  Configuration for applying int8 dynamic per token asymmetric activation quantization and int4 per group weight symmetric quantization to linear
    This is used to produce a model for executorch backend, but currently executorch did not
    support lowering for the quantized model from this flow yet

    Args:
        `group_size`: parameter for quantization, controls the granularity of quantization, smaller
         size is more fine grained
        `layout`: layout type for quantized weight tensor, only supports `MarlinQQQLayout()` and `CutlassInt4PackedLayout()` for now
        `mapping_type`: quantization type for weight, controls the weight quantization is symmetric or asymmetric
        `act_mapping_type`: quantization type for activation, controls the activation quantization is symmetric or asymmetric
        `set_inductor_config`: if True, adjusts `torchinductor` settings to recommended values.
        
group_sizelayoutr	  act_mapping_typeTset_inductor_configc                 l    t           j                            d           t          j        d           d S )Nz:torchao.quantization.Int8DynamicActivationInt4WeightConfigz`Int8DynamicActivationInt4WeightConfig` will be moving to prototype in a future release of torchao. Please see https://github.com/pytorch/ao/issues/2752 for more details.r   r   r   warningswarnr   s    r   __post_init__z3Int8DynamicActivationInt4WeightConfig.__post_init__\  E    $$H	
 	
 	
 	 y	
 	
 	
 	
 	
r   N)r   r   r   __doc__r   int__annotations__r   r!  r#   rT   r  r	  r  r"  r#  boolr(  rk   r   r   r  r  G  s           J []]FF""" + 5L+555$/$:k::: $$$$
 
 
 
 
r   r  r]   custom_scalecustom_zero_pointr   r/  r0  c                .   |j         }|j        }|j        }|j        }|j        r#t
          j        j                                         | j	        }||dk    r|j
        d         }|j
        d         |z  dk    r| S d|f}	t          j        }
d}d}|t          j        k    r%t          |t                     rt"          }ngt$          }n_|t          j        k    rBt          |t(                    rt*          }n2t          |t,                    rt.          }nt*          }nJ d|             t          |t(                    rt1          ||	|||          }nut          |t,                    rt3          |          }nPt          |t                     r#t5          |||	t          j        dd	|||
	  	        }nt5          |||	|
|||||	  	        }t9          ||          }t          j                            |d          | _	        t?          j         tB          |           | _"        | S )Nro   r   rE      F%Unsupported activation mapping type: )_layout   )r
  r  r  r5  r/  r0  )r5  r/  r0  r   )#r   r!  r	  r"  r#  torchaoquantizationutils"recommended_inductor_config_setterr   r   r   r  rT   r  r   r   r  r  r  r   r  r   _int8_symm_cutlass_quantr    _int4_symm_cutlass_quantr   r  rO   r   r   r   r   r   r   )r   r   r/  r0  r   r!  r	  r"  r   
block_sizer
  r  r  r   s                 r   ._int8_dynamic_activation_int4_weight_transformr>  k  sT    "J]F&L.! H"EEGGG]FZ2--\"%
|B*$)) ZJ:LII ;111f?@@ 	;;:	[2	2	2fo.. 	:9 788 	:79PP>NPPPPu&/** 
,J	9f
 
 
 
F3	4	4 
)&11	F=	>	> 
)%/

 

 

 *%/

 

 

 ,F4DEEFH&&vU&CCFM();VDDFMr   c                      e Zd ZU dZej        Zej        ed<    e	d          Z
eed<   ej        Zeed<   dZeej                 ed<   ej        Zeed<    e            Zeed	<   ej        Zeed
<   ej        Zeed<   dZeed<   d ZdS )%Int8DynamicActivationIntxWeightConfiga7	  
    Configuration for dynamically quantizing activations to torch.int8 and weights to torch.intx, with 1 <= x <= 8.
    More specifically, activations are dynamically quantized to 8-bits at a per-token granularity with scales/zeros.
    Weights are quantized with scales/zeros in a groupwise or channelwise manner using the number of bits specified by weight_dtype.

    This layout is identical to Int8DynamicActivationInt4WeightConfig when weight_dtype is torch.int4 and other args
    are the same.  However, this layout is more general and supports other weight dtypes.

    args:
        `weight_dtype`: The dtype to use for weight quantization.  Must be torch.intx, where 1 <= x <= 8.
       ` weight_granularity`: The granularity to use for weight quantization.  Must be PerGroup or PerAxis(axis=0).
        `weight_mapping_type`: The type of mapping to use for the weight quantization.
            Must be one of MappingType.ASYMMETRIC or MappingType.SYMMETRIC.  MappingType.SYMMETRIC requires ZeroPointDomain.NONE
        `weight_scale_dtype`: The dtype to use for the weight scale.
        `act_mapping_type`: The type of mapping to use for the activation quantization.
            Must be one of MappingType.ASYMMETRIC or MappingType.SYMMETRIC.
        `layout`: The layout to use for the packed weight tensor:
            - PackedLinearInt8DynamicActivationIntxWeightLayout: this layout is optimized for CPU performance.
            - QDQLayout: this layout represents the quantization with Q/DQ quant primitives, and is intended for
                export applications like ExecuTorch.
        `intx_packing_format`: The format to use for the packed weight tensor (version 2 only).
            - unpacked_to_int8: this format is the default and is intended for export applications like ExecuTorch.
            - opaque_torchao_auto: this format is optimized for CPU performance.
        `intx_choose_qparams_algorithm`: The algorithm to use for choosing the quantization parameters.
        `version`: version of the config to use, only subset of above args are valid based on version, see note for more details.

        Note:

        Current state for Int8DynamicActivationIntxWeightConfig is that it supports both v1 (legacy) and v2.

        * `intx_packing_format` is used for version 2.
        * `layout` is only used for version 1.
    weight_dtyper  weight_granularityweight_mapping_typeNweight_scale_dtyper"  r!  intx_packing_formatintx_choose_qparams_algorithmr   versionc                    t           j                            d           | j        d t	          dd          D             v sJ d| j                     t          | j        t          t          f          sJ d| j                     t          | j        t                    r'| j        j	        dk    sJ d| j        j	                     | j
        t          j        t          j        t          j        fv sJ d	| j
                     | j        t          j        t          j        fv sJ d
| j                     t          | j        t"          t$          f          sJ d| j                     t          | j        t"                    rw| j        j        t(          j        t(          j        t(          j        fv rJ| j        | j        t           j        k    r0t5          j        d| j        j         d| j         d           d S d S d S d S )Nz:torchao.quantization.Int8DynamicActivationIntxWeightConfigc                 >    g | ]}t          t          d |           S r+  r   r   .0bs     r   
<listcomp>zGInt8DynamicActivationIntxWeightConfig.__post_init__.<locals>.<listcomp>  (    $T$T$T1WUI!II%>%>$T$T$Tr   rE   	   <weight_dtype must be torch.intx, where 1 <= x <= 8, but got z8weight_granularity must be PerAxis or PerGroup, but got r   zaxis must be 0, but got z~weight_mapping_type must be MappingType.ASYMMETRIC or MappingType.SYMMETRIC or MappingType.SYMMETRIC_NO_CLIPPING_ERR, but got zRact_mapping_type must be MappingType.ASYMMETRIC or MappingType.SYMMETRIC, but got zWlayout must be PackedLinearInt8DynamicActivationIntxWeightLayout or QDQLayout, but got zPWhen using layout PackedLinearInt8DynamicActivationIntxWeightLayout with target z[, the weight scale may be cast to bfloat16 by the kernel, but weight_scale_dtype is set to z. Explicitly set weight_scale_dtype to torch.bfloat16 to suppress this warning. If you need weight_scale_dtype = torch.float32, use target=Target.UNIVERSAL instead.)r   r   r   rA  ranger   rB  rJ   rK   axisrC  rT   r  r  SYMMETRIC_NO_CLIPPING_ERRr"  r!  r   r   targetr!   AUTOKLEIDIAIATENrD  bfloat16loggingwarningr   s    r   r(  z3Int8DynamicActivationIntxWeightConfig.__post_init__  s   $$H	
 	
 	
  $T$TaQR$T$T$TTTT^4K\^^ UTT $1GX3FGG 	
 	
`tG^``	
 	
G d-w77 	*/1444I4+B+GII 544 '"!1,
 
 
 

 h  NR  Nf  h  h
 
 
 $"!)
 
 
 
 yaeavxx	
 
 
 KKYW
 
 	
 	
 tfjfqss	
 	
 
 dk#TUU 
	{!fk6?FK%PPP+4+u~==Ookokvk} o otx  uLo o o    
	 
	PP==r   )r   r   r   r*  r   r  rA  dtyper,  rK   rB  rI   rT   r  rC  rD  r
   r  r"  r   r!  r#   r:   UNPACKED_TO_INT8rE  r8   AFFINErF  rG  r+  r(  rk   r   r   r@  r@    s            D !&
L%+***&.hrll222'2'<<<<04-444$/$:k:::Y[[FF   ->-O*OOO") "#=    GS+ + + + +r   r@  c                r   |j         }|j        }|j        }|j        }|j        }	|j        }
|j        }|j        }|                                 dk    sJ d|                                              t          |t                    r|j        }nRt          |t                    r+|j        dk    sJ d|j                     | j        d         }nt          d|           d|f}|j        dk    r|	t"          j        k    sJ t&          j        t&          j        t&          j        t&          j        g}|t&          j        k    s||v sJ d|             |4|j        t4          j        k    r|                    t4          j                  }t=          j        | |||d	|||
          }||| j        k    rtA          || |           |}||v rtC          j"        |||          }d }||fS |j        dk    sJ |tF          j$        k    s
J d            tK          j&        d           tN          |         \  }}|t"          j(        t"          j)        fv }tU          | |d|ft4          j        |||t4          j        |tV          j,        t[                                } t          |
tZ                    rN|	t"          j        k    rt\          }n%|	t"          j(        k    rt^          }nJ d|	             ta          | |          } nt          |
tb                    r|	t"          j        k    s
J d            | j2        3                                \  }}}| j        d         |z  }|4                    d|          }|J |4                    d|          }|dk    5                                }tm          |||r|nd |||
j7        d          } d }| |fS )Nr   zFInt8DynamicActivationIntxWeightConfig only works for 2-d Tensor, got: r   %axis must be 0 with PerAxis, but got ro   z4weight_granularity must be PerGroup or PerAxis, got rE   Unsupported packing format: int8_asym_per_token)r	  activation_quantizationrF  r/  r0  )r   rE  zOIntxChooseQParamsAlgorithm.AFFINE is the only supported algorithm for version 1zConfig Deprecation: version 1 of Int8DynamicActivationIntxWeightConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2967 for more detailsinput_floatr	  r=  r
  r  r  r  r  preserve_zerozero_point_domainr5  Fr4  zbPackedLinearInt8DynamicActivationIntxWeightLayout requires act_mapping_type=MappingType.ASYMMETRIC)validate_inputs)8rA  rB  rC  rD  r"  r!  rE  rF  dimr   rK   r   rJ   rT  r   r   rG  rT   r  r:   OPAQUE_ATEN_KLEIDIAIOPAQUE_TORCHAO_AUTOOPAQUE_TORCHAO_KLEIDIAIOPAQUE_TORCHAO_LOWBITr^  r]  r   r  rs   r  r;   from_hp+_adjust_scale_dtype_in_intx_unpacked_tensorr9   !from_intx_unpacked_to_int8_tensorr8   r_  r&  r'  rS   r  rU  r   rU   INTr   r  r  rO   r   tensor_impl	get_plainreshapeanyr"   rV  )r   r   r   r/  r0  rA  rB  rC  rD  r"  r!  rE  rF  r   r=  opaque_formats
new_weightnew_biasr  r  rg  activation_quant_funcdatascale
zero_pointgroups_per_rowhas_weight_zeross                              r   4_int8_dynamic_activation_intx_weight_quantize_tensorr    s    &L2 42.]F 4$*$H!::<<1_QWQ[Q[Q]Q]__  $h// 

'2

	&	0	0 
!&!+++M4F4KMM ,++ \"%

WCUWW
 
 	
 ZJ~;#999992153	
  #4#EEE"n444?*=?? 545 (->-D-S-S 1 4 4UZ @ @-5,$9*G%/	
 	
 	

 ).@FL.P.P7F$6    .00)K?R  J H8## >Q(,F,MMMMY NMM M 	i   3<@Iy (-, M
 &(z?Z&#)-  F &)$$ #
 {555$?!!!666$>!!TTBRTTTT5/8MNN	FM	N	N   ;#9999p :99 #)"4">">"@"@eZb)Z7b.11%%%''N;;
&!O0022N*4JJM!
 
 
 4<r   c                   t          | j        | j        |||          \  }}t          j                            |d          | _        |d | _        t          | t          j                  rt          j	        t          |           | _        | S )Nr.  Fr   )r  r   r   r   r   r   r   r   r   r   r   r   )r   r   r/  r0  rx  ry  s         r   ._int8_dynamic_activation_intx_weight_transformr    s     P!+  J H&&z&GGFM&")$$ I!,-?HHMr   c                   v    e Zd ZU dZ e            Zeed<   ej	        Z
eed<   ej	        Zeed<   dZeed<   d ZdS )	%Int4DynamicActivationInt4WeightConfiga^  Applies int4 dynamic per token symmetric activation quantization and int4 per row weight symmetric quantization to linear

    Args:
        `layout`: layout type for quantized weight tensor, only supports `MarlinQQQLayout()` and `CutlassInt4PackedLayout()` for now
        `mapping_type`: quantization type for weight, controls the weight quantization is symmetric or asymmetric
        `act_mapping_type`: quantization type for activation, controls the activation quantization is symmetric or asymmetric
        `set_inductor_config`: if True, adjusts `torchinductor` settings to recommended values.
    r!  r	  r"  Tr#  c                 l    t           j                            d           t          j        d           d S )Nz:torchao.quantization.Int4DynamicActivationInt4WeightConfigz`Int4DynamicActivationInt4WeightConfig` will be moving to prototype in a future release of torchao. Please see https://github.com/pytorch/ao/issues/2752 for more details.r%  r   s    r   r(  z3Int4DynamicActivationInt4WeightConfig.__post_init__  r)  r   N)r   r   r   r*  r   r!  r#   r,  rT   r  r	  r"  r#  r-  r(  rk   r   r   r  r    s           -,..FF... + 5L+555$/$9k999 $$$$
 
 
 
 
r   r  #int4_dynamic_activation_int4_weightc                 0   | j         }|j        }|j        }|j        }|j        r#t
          j        j                                         t          |t                    st          d| d          |t          j        k    rt          d          |t          j        k    rt          d          t          |          }t          |t                    }t           j                            |d          | _         t'          j        t*          |           | _        | S )Nz;Only CutlassInt4PackedLayout layout is supported. Received rr   z)Only mapping_type=SYMMETRIC is supported.z-Only act_mapping_type=SYMMETRIC is supported.Fr   )r   r!  r	  r"  r#  r7  r8  r9  r:  r   r   NotImplementedErrorrT   r  r<  rO   r   r   r   r   r   r   r   )r   r   r   r!  r	  r"  s         r   ._int4_dynamic_activation_int4_weight_transformr    s    ]F]F&L.! H"EEGGGf566 
!S&SSS
 
 	
 {,,,!"MNNN;000!"QRRR%f--F+  F H&&vU&CCFM();VDDFMr   c                       e Zd ZU dZdZee         ed<   dZeed<   dZ	ee         ed<   dZ
ee         ed	<   d
Zeed<   d ZdS )GemliteUIntXWeightOnlyConfiga#  
    applies weight only 4 or 8 bit integer quantization and utilizes the gemlite triton kernel and its associated weight packing format.
    This only works for fp16 models. 8 bit quantization is symmetric, 4 bit quantization is asymmetric.

    Args:
        `group_size`: parameter for quantization, controls the granularity of quantization, smaller
         size is more fine grained
        `bit_width`: bit width of the quantized weight.
        `packing_bitwidth`: bit width of the packed weight, should be 8 or 32. Can have performance impacts depending on hardware.
        `mode`: if set to "dynamic", activations are quantized at runtime; default is "weight_only" (weight-only quantization).
        `set_inductor_config`: if True, adjusts `torchinductor` settings to recommended values.
       r      	bit_widthNpacking_bitwidthweight_onlymodeTr#  c                 l    t           j                            d           t          j        d           d S )Nz1torchao.quantization.GemliteUIntXWeightOnlyConfigz`GemliteUIntXWeightOnlyConfig` will be moving to prototype in a future release of torchao. Please see https://github.com/pytorch/ao/issues/2752 for more details.r%  r   s    r   r(  z*GemliteUIntXWeightOnlyConfig.__post_init__  sE    $$?	
 	
 	
 	 p	
 	
 	
 	
 	
r   )r   r   r   r*  r   r
   r+  r,  r  r  r  strr#  r-  r(  rk   r   r   r  r    s           !$J###Is&*hsm***'D(3-''' $$$$
 
 
 
 
r   r  re   c                    |j         }|j        }|j        }|j        }|j        r#t
          j        j                                         | j	        }ddl
m} |dk    rdnd}t          |fi  |||||||          }	t          j                            |	d          | _	        t!          j        t$          |           | _        | S )Nr   )get_gemlite_aqt_kwargsr  TF)r   r  r  r  use_hqqr   )r   r  r  r  r#  r7  r8  r9  r:  r   -torchao.prototype.dtypes.uintx.gemlite_layoutr  r   r   r   r   r   r   r   r   )
r   r   r   r  r  r  r   r  r  rx  s
             r   $_gemlite_uintx_weight_only_transformr  #  s     "J I.;D! H"EEGGG]FTTTTTT1nndd%G)
 

 
 !-
 
 

 
J H&&z&GGFM();VDDFMr   c                       e Zd ZU dZdZeed<    ed          Ze	e         ed<   dZ
eed<   ej        Ze	e         ed	<   d
Zeed<   dZe	e         ed<   ej        Zeed<   ej        Zeed<   dZeed<   d ZdS )Int4WeightOnlyConfiga  
    Configuration for int4 weight only quantization, only groupwise quantization is supported
    right now, and we support version 1 and version 2, that are implemented differently although with
    same support. In version 2, different target are mainly distinguished by `packing_format` arg, and in version 1, mainly by `layout`.

    Args:
        `group_size`: parameter for quantization, controls the granularity of quantization, smaller
         size is more fine grained, choices are [256, 128, 64, 32], used in both version 1 and 2
        `int4_packing_format`: the packing format for int4 tensor, used in version 2 only
         `int4_choose_qparams_algorithm`: variants of choose qparams algorithm to use for int4,
         currently support TINYGEMM ("tinygemm") and HQQ ("hqq"), used in version 2 only
        `layout`: layout type for quantized tensor, default is `TensorCoreTiledLayout(inner_k_tiles=8)`, used in version 1 only
        `use_hqq`: whether to use hqq or default quantization mode, default is False, used in version 1 only
        `zero_point_domain`: data type of zeros points, choices are [ZeroPointDomain.FLOAT, ZeroPointDomain.INT, ZeroPointDomain.NONE], used in version 1 only
        `set_inductor_config`: if True, adjusts `torchinductor` settings to recommended values. used in both version 1 and 2
        `preserve_zero`: whether to preserve zero, default is None. Will be set to True if zero_point_domain is ZeroPointDomain.INT, used in version 1 only
        `version`: version of the config to use, only subset of above args are valid for version 1, and subset of above args are valid for version 2, default is 2, see note for more details

    Note:
        Current state for Int4WeightOnlyConfig is that it supports both v1 (legacy) and v2

        For v2 (version = 2), only `group_size`, `int4_packing_format`, `int4_choose_qparams_algorithm` and `set_inductor_config` are valid, all other args will be ignored
        For v1 (version = 1), only `group_size`, `layout`, `use_hqq`, `zero_point_domain`, `preserve_zero` and `set_inductor_config` are valid, we plan to deprecate v1 in torchao 0.15 to make this config
        less confusing
    r  r      )inner_k_tilesr!  Fr  rh  Tr#  Nrg  int4_packing_formatint4_choose_qparams_algorithmr   rG  c                 D    t           j                            d           d S )Nz)torchao.quantization.Int4WeightOnlyConfigr   r   r   r   s    r   r(  z"Int4WeightOnlyConfig.__post_init__l  s    $$%PQQQQQr   )r   r   r   r*  r   r+  r,  r   r!  r
   r  r-  rU   NONErh  r#  rg  r2   PLAINr  r0   TINYGEMMr  rG  r(  rk   r   r   r  r  C  s          4 J.C.CRS.T.T.TFH*+TTTGT3B3Gx0GGG $$$$$(M8D>(((->-D*DDD"+ "#=    GSR R R R Rr   r  r`   c                 .   |j         }|j        }|j        }|j        }|j        }|j        }| j        d         |z  dk    r't                              d| j         d|            | S t          d t          | j        dz
            D             |gz             }|j        dk    r t          |          }|t          j        k    r|t           j        k    sJ d| d	            |t           j        k    r#t'          j        | |t*          j        
          }	|	S |t           j        k    rt1          j        | |          }	|	S |t           j        k    rt5          j        | |          }	|	S |t           j        k    rt9          j        | |          }	|	S |t           j        k    rt;          j        | ||          }	|	S t=          d|           |j        dk    sJ t?          j         d           tB          j"        }
t*          j#        }d}d}d}tI          |tJ                    r| j&        nt*          j        }tO          |          tP          )                                v s$J dtP          )                                             |tT          j+        k    r!tP          tO          |                   d         }n4|tP          tO          |                   v sJ dtP          |                      |tT          j,        k    r!tI          |tZ                    rt*          j#        }|j.        |j.        nt^          tO          |                   }tI          |t`                    r0tB          j1        }
|dk    s|| j        d         k    sJ d|             te          | |
||||||||||          }	|	S )Nro   r   zZSkipping quantizing weight with int4 weight only quantization because the shape of weight z# is not compatible with group_size c                     g | ]}d S rE   rk   rM  r   s     r   rO  z5_int4_weight_only_quantize_tensor.<locals>.<listcomp>      :::a:::r   rE   r   zBInt4ChooseQParamsAlgorithm.HQQ is not supported by packing format zF, it's only supported by Int4PackingFormat.TILE_PACKED_TO_4D currentlyactivation_dtype)r  z!Unsupported int4 packing format: zConfig Deprecation: version 1 of Int4WeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2948 for more detailsr6  gư>zOnly support layout: zLayout only support r  zQMarlinSparseLayout only supports 128 group size or per channel quantization, got )r  rg  rh  r5  r  )3r   r!  r  r  rh  r  r   loggerinfotuplerS  ndimrG  rt   r0   HQQr2   TILE_PACKED_TO_4DPRESHUFFLEDr4   ro  r   rZ  r  r5   PLAIN_INT32r3   MARLIN_SPARSEr1   r6   r   r&  r'  rT   r  r  r   r   r]  r   LAYOUT_TO_ZERO_POINT_DOMAINkeysrU   r  rr  r   rg  LAYOUT_TO_PRESERVE_ZEROSr   r  r   )r   r   r   r!  r  r  rh  r  r=  rx  r	  r
  r  r  r  r  rg  s                    r   !_int4_weight_only_quantize_tensorr  u  s6    "J]FnG$*$H!0 4|B*$)) gioiu  g  g  [e  g  g	
 	
 	
 ::5q#9#9:::j\IJJJ~*%%
(,F,JJJ&*;*MMMMXUh X X X NMM
 "3"???.6!&  J
  $5$;;;#+ J  $5$AAA-5 J  $5$CCC/7 J  $5$GGG19.K  J
 VATVVWWW>QM 	X   )L;LII
C"6=99Mu~ 
 <<6;;=====D ; @ @ B BDD >== O0007VEaH $?V$MMMMH#>v#FHH NMM O///Jv}4U4U/ ; + 	%d6ll3  &,-- 
",S  J&,r2B$B$B$Bl`jll %C$BB *)#+  J r   c                 B   |j         r#t          j        j                                         t          | d          s
J d            t          | j        |          }t          j	        
                    |d          | _        t          j        t          |           | _        | S )Nr   gapplying int8 weight only quant requires module to have weight attribute but {module} does not have oneFr   )r#  r7  r8  r9  r:  r   r  r   r   r   r   r   r   r   r   r   r   rx  s      r   _int4_weight_only_transformr    s     ! H"EEGGG68$$  	, $ 36=&IIJH&&z&GGFM();VDDFMr   c                   "    e Zd ZU dZdZeed<   dS )'Float8DynamicActivationInt4WeightConfigai  Configuration for apply float8 dynamic per row quantization and int4
    per group weight quantization to linear
    (only group_size 128 is supported right now since underlying kernel used only supports 128
    and above and no benefits of making it bigger)

    Args:
        `int4_packing_format`: how the weight is packed, only preshuffled is supported
    preshuffledr  N)r   r   r   r*  r  r2   r,  rk   r   r   r  r    s0           .;*:::::r   r  c                    t          | d          s
J d            |j        }|dk    sJ d|             | j        }d}t          d t	          |j        dz
            D             |gz             }t          j        | j        |t          j	                  }t          j
                            |d	
          | _        t          j        t          |           | _        | S )Nr   r  r  z?only preshuffled int4_packing_format supported right now, got: r  c                     g | ]}d S r  rk   r  s     r   rO  zD_float8_dynamic_activation_int4_weight_transform.<locals>.<listcomp>   r  r   rE   r  Fr   )r   r  r   r  rS  r  r4   ro  r   float8_e4m3fnr   r   r   r   r   r   )r   r   r  r   r   r=  rx  s          r   0_float8_dynamic_activation_int4_weight_transformr    s     68$$  	, $ !4-///_J]__ 0// ]FJ::5q#9#9:::j\IJJJ&.,  J
 H&&z&GGFM();VDDFMr   c                   z    e Zd ZU dZdZee         ed<    e            Z	ee
         ed<   dZeed<   dZeed<   d	 ZdS )
Int8WeightOnlyConfiga  
    Configuration for applying int8 weight-only symmetric per-channel quantization to linear layers.

    Args:
        group_size (version 1) - Controls the granularity of quantization.
        If None, applies per-channel quantization. Otherwise, applies per-group quantization with the specified group size.
        granularity (version 2) - Quantization granularity.
            PerRow() for per-channel quantization, PerTensor() for per-tensor quantization.
        set_inductor_config: bool = True - If True, adjusts `torchinductor` settings to recommended values
            for better performance with this quantization scheme.
    Nr   granularityTr#  rE   rG  c                     t           j                            d           | j        dk    r| j        J d| j                     d S d S )Nz)torchao.quantization.Int8WeightOnlyConfigr   z1Only support version 2 with group_size=None, got )r   r   r   rG  r   r   s    r   r(  z"Int8WeightOnlyConfig.__post_init__>  sW    $$%PQQQ<1?**UDOUU +** **r   )r   r   r   r*  r   r
   r+  r,  rL   r  rI   r#  r-  rG  r(  rk   r   r   r  r  +  s         
 
 !%J$$$)/K+&111 $$$$GS    r   r  ra   c                    |j         dk    rt          j        d           t          j        }t
          j        }t          j        t
          j                  j	        }t
          j
        }|j        }|| j        d         }t          d t          |                                 dz
            D             |gz             }t!          | |||||          }n8|j         dk    sJ d|j                      t#          j        | |j                  }|S )	NrE   zConfig Deprecation: version 1 of Int8WeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more detailsro   c                     g | ]}d S r  rk   )rM  r   s     r   rO  z5_int8_weight_only_quantize_tensor.<locals>.<listcomp>V      ???!A???r   )r  r  r   Unexpected version: )r  )rG  r&  r'  rT   r  r   r  r  r  r  int64r   r   r  rS  rj  r   r7   ro  r  )	r   r   r	  r
  r  r  r   r=  rx  s	            r   !_int8_weight_only_quantize_tensorr  J  s   ~ \	
 	
 	
 #,zk%-((, ;&
b)J??uVZZ\\A-='>'>???:,NOO
--
 
 


 ~"""$K6>$K$K"""'F<NOOO
r   r   r   c                   |j         r#t          j        j                                         t          | |          s
J d            t          t          | |          |          }t          | |t          j
                            |d                     t          j        t          t          | j        |          |           | _        | S )Nzqapplying int8 weight only quant requires module to have {parameter_name} attribute but {module} does not have oneFr   r   r   )r#  r7  r8  r9  r:  r   r  r   rw   r   r   r   r   r   r   r   r   r   r   r   quantized_tensors       r   _int8_weight_only_transformr  e  s     ! H"EEGGG6>**  	, * 9''  +5AA  
 ( & 1)	
 	
 	

 	 F Mr   c                     t           j        }t          j        }d}d}d}t	          | |t          |           ||||| j        t          j        k    rt          j        nd           S r  )	rT   r  r   r  r   rY   r]  float16r  r  s         r   (_int8_symm_per_token_reduced_range_quantr    sk    (L:L
CII#	!!$$%&W%=%=EMM4	 	 	 	r   c                     t           j        }t          j        }d}d}d}| j        d         dk    r| S t          | |t          |           ||||| j        t          j        k    rt          j	        nd           S )Nr  r  r  rE   r  )
rT   r  r   r  r   r   rY   r]  r  r  r  s         r   4_int8_symm_per_token_reduced_range_quant_noop_decoder    s     (L:L
CIIwqzQ'%a(()*EM)A)At	
 	
 	
 		
r   c           
          t          | t          j        t          |           t          j        t          j        t	          j        t          j                  j        t          j
                  S )N)r	  r=  r
  r  r  rh  )r   rT   r  rY   r   r  r  r  r  rU   r  r   s    r   r;  r;    sO    #	 *,Q//ZMK&&*).   r   c                     t          | t          j        t          |           t          j        ddt          j        t	          j        t          j                  j        t          j
        t                      
  
        S )Nr2  r3  )	r	  r=  r
  r  r  r  r  rh  r5  )r   rT   r  rY   r   r  r  r  r  rU   r  r   r  s    r   r<  r<    s^    #	 *,Q//ZMK&&*).'))   r   r
  c           	      t    t          | t          |           t          j        |t	          d                     S )N	mm_configr=  r  r
  r5  )r   rY   r   r  r   r   r
  s     r   _float8_cutlass_quantr    s?     &	,Q//M!t,,,   r   c                 p    t          | t          |           t          j        |t	                                S )Nr  )r   rY   r   r  r   r  s     r   _float8_cutlass_quant_sparser    s:     &	,Q//M!'))   r   c                       e Zd ZU dZ e            Zee         ed<   e	j
        Zee	         ed<   dZeed<    e            Zeed<   dZeed<   d	Zeed
<   d ZdS )%Int8DynamicActivationInt8WeightConfiga  
    Configuration for applying int8 dynamic symmetric per-token activation and int8 per-channel weight
    quantization to linear layers.

    Args:
        layout: Optional[Layout] = PlainLayout() - Tensor layout for the quantized weights. Controls how the
            quantized data is stored and accessed.
        act_mapping_type: Optional[MappingType] = MappingType.SYMMETRIC - Mapping type for activation quantization.
            SYMMETRIC uses symmetric quantization around zero.
        weight_only_decode: bool = False - If True, only quantizes weights during forward pass and keeps activations
            in original precision during decode operations.
        set_inductor_config: bool = True - If True, adjusts `torchinductor` settings to recommended values
            for better performance with this quantization scheme.
        version (int): the version of the config, version 1 is using AffineQuantizedTensor that we plan to deprecate/split, version 2 is using Int8Tensor
    r!  r"  Fweight_only_decoder  Tr#  rE   rG  c                 D    t           j                            d           d S )Nz:torchao.quantization.Int8DynamicActivationInt8WeightConfigr  r   s    r   r(  z3Int8DynamicActivationInt8WeightConfig.__post_init__  s)    $$H	
 	
 	
 	
 	
r   N)r   r   r   r*  r   r!  r
   r#   r,  rT   r  r"  r  r-  rL   r  rI   r#  rG  r+  r(  rk   r   r   r  r    s             +{}}FHV,,,.9.Ch{+CCC$$$$%vxxK''' $$$$GS
 
 
 
 
r   r  r^   c           
      h   |j         dk    r|j        }|j        }|j        }| j        d         }|dk    r't
                              d| j         d|            | S t          j        }t          j
        }d }t          j        }	t          j        t          j                  j        }
t          j        }|rt"          }n|t          j        k    rt$          }nt&          } ||           }t)          | |||	|
|||          }t+          ||          }ndd	lm} |j        t3                      t5                      hv s
J d
            |j        }|j        }|j        t          j        k    s
J d            |j         dk    sJ d|j                      t7          j        | | |||j                            }|S )NrE   ro      zKSkipping applying Int8DynamicActivationInt8WeightConfig to weight of shape z  because `in_feature` is <= 16: c                     t          d t          |                                 dz
            D             | j        d         gz             S )Nc                     g | ]}d S r  rk   r  s     r   rO  zg_int8_dynamic_activation_int8_weight_quantize_tensor.<locals>.get_weight_block_size.<locals>.<listcomp>  s    888!888r   rE   ro   )r  rS  rj  r   r  s    r   get_weight_block_sizezS_int8_dynamic_activation_int8_weight_quantize_tensor.<locals>.get_weight_block_size  s@    88U15577Q;%7%7888AGBK=HIIIr   )r  r  r5  rh  r   )QuantizeTensorToInt8Kwargsz'Only PerRow and PerTensor are supportedz0asymmetric dynamic quant not supported currentlyr   r  )r  r	  )r  act_quant_kwargs)rG  r!  r"  r  r   r  r  rT   r  rU   r  r   r  r  r  r  r  r  r  r  r   rO   9torchao.quantization.quantize_.workflows.int8.int8_tensorr  r  rL   rM   r7   ro  )r   r   r!  r"  r  in_featuresr	  weight_zero_point_domainr  r
  r  r  r   r=  rx  quantized_weightr  rB  act_granularitys                      r   4_int8_dynamic_activation_int8_weight_quantize_tensorr  	  s?   ~!2#6l2&"KKA^d^j A A3>A A   M #,#2#7 	J 	J 	J zk%-((, ; 	?S  ;#888#K  #> **622
--6	
 	
 	

 :*FVWW	
 	
 	
 	
 	
 	
 !fhh	%<<<<5 =<< $/ ,&+*????> @?? ~"""$K6>$K$K""" &-*77+#4  
 
 
 r   c                 B   |j         r#t          j        j                                         t          | d          s
J d            t          | j        |          }t          j	        
                    |d          | _        t          j        t          |           | _        | S )Nr   zyapplying int8 dynamic activation int8 weight quant requires module to have weight attributebut {module} does not have oneFr   )r#  r7  r8  r9  r:  r   r  r   r   r   r   r   r   r   r   r  s      r   ._int8_dynamic_activation_int8_weight_transformr  V  s     ! H"EEGGG68$$  	+ $ Fv J H&&z&GGFM();VDDFMr   c                  b    t          j        d           t          t                                S )z
    Applies int8 dnynamic symmetric per-token activation and int8 per-channel weight
    quantization + 2:4 sparsity to linear layers.
    a  int8_dyanmic_activation_int8_semi_sparse_weight() will be deprecated at a later release. Please use the layout kwarg in Int8DynamicActivationInt8WeightConfig instead.

    from torchao.dtypes import SemiSparseLayout
    Int8DynamicActivationInt8WeightConfig(layout=SemiSparseLayout())r!  )r&  r'  r  r   rk   r   r   r_   r_   i  s8    
 M	G   18H8J8JKKKKr   c                   N    e Zd ZU dZeZej        ed<   dZ	e
ed<   dZeed<   d ZdS )	Float8WeightOnlyConfigaV  
    Configuration for applying float8 weight-only symmetric per-channel quantization to linear layers.

    Args:
        weight_dtype (torch.dtype): The target data type for weight quantization. Default is torch.float8_e4m3fn.
        set_inductor_config (bool): if True, adjusts `torchinductor` settings to recommended values.
        version (int): the version of the config, version 1 is using AffineQuantizedTensor that we plan to deprecate/split, version 2 is using Float8Tensor (default)

    Note:
        The actual matmul will be computed in original precision of the weight tensor.
    rA  Tr#  r   rG  c                 D    t           j                            d           d S )Nz+torchao.quantization.Float8WeightOnlyConfigr  r   s    r   r(  z$Float8WeightOnlyConfig.__post_init__  s    $$%RSSSSSr   N)r   r   r   r*  r$   rA  r   r]  r,  r#  r-  rG  r+  r(  rk   r   r   r  r  x  si         
 
 !+L%+*** $$$$GST T T T Tr   r  rb   c           	         |j         dk    rt          j        d           ddlm} t          d t          |                                 dz
            D             | j        d         gz             } || ||j	        d t          d                     }nG|j         d	k    sJ d
|j                      |j	        }t          j        | |t                                }|S )NrE   zConfig Deprecation: version 1 of Float8WeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2649 for more detailsr   )r   c                     g | ]}d S r  rk   r  s     r   rO  z4_float8_weight_only_quant_tensor.<locals>.<listcomp>  r  r   ro   r  rf  r=  r
  r  r5  r   r  )float8_dtyper  )rG  r&  r'  torchao.dtypesr   r  rS  rj  r   rA  r   r/   ro  rL   )r   r   r   r=  rx  rA  s         r    _float8_weight_only_quant_tensorr    s   ~ ^	
 	
 	
 	>=====??uVZZ\\A-='>'>???6<PRCSBTTUU
//!, 4000
 
 


 ~"""$K6>$K$K"""*!)688
 
 

 r   c                   |j         r#t          j        j                                         t          | |          s
J d            t          | t                    rt          |           } t          t          | |          |          }t          | |t          j                            |d                     t          j        t#          t$          | j        |          |           | _        | S )Nzsapplying float8 weight only quant requires module to have {parameter_name} attribute but {module} does not have oneFr   r  )r#  r7  r8  r9  r:  r   r   r&   _unwrap_float8_linearr  r   rw   r   r   r   r   r   r   r   r   r  s       r   _float8_weight_only_transformr    s    ! H"EEGGG6>**  	, *
 &,'' /&v..7''  +5AA  
 ( & 1)	
 	
 	

 	 F Mr   activation_granularityr  r|  r}  c           	         |
J d            t          |t                    r| j        t          j        k    s
J d            t          | j        |          }|-t          | ||t          j        t          d                    }nAt          |t                    s
J d            t          | |||t          d                    }|S )zThis function is used to quantize the input activation tensor for an aqt_float variant. If scale
    is not provided it will be dynamically calculate the scales otherwise it will use the provided scale.
    Nz8Zero point is not supported for dynamic FP8 quantizationzFPerRow quantization only works for bfloat16 precision input activationr  r  7Static quantization only supports PerTensor granularity)rf  r=  r|  r
  r5  )r   rL   r]  r   rZ  r?   r   r   r  r   rM   r   )r   r  r  r|  r}  r=  
activations          r    _input_activation_quant_func_fp8r    s     B  (&11 
w%.(((T )((  )?@@J}/!) 4000
 
 


 0)<< 	
 	
E	
 	
< 7!) 4000
 
 

 r   c           	         |                                  dv s J d|                                   d            | j        dd         \  }}|dz  dk    o|dz  dk    }|s)t                              d| j         d	| d
| d           |S )z
    Check if a weight tensor meets float8 quantization requirements.

    Args:
        weight (torch.Tensor): The weight tensor to check

    Returns:
        bool: True if the tensor can be quantized to float8, False otherwise
    )r   r   z6float8 quantization only works for 2/3-D tensors, got zD tensorr   Nr  r   z+Skipping float8 quantization: weight shape z: is not compatible with _scaled_mm. Both input dimension (z) and output dimension (z) must be multiples of 16. )rj  r   r  r  )r   out_dimin_dimis_compatibles       r   _fp8_mm_compatr    s     ::<<     W

VVV  
 l233'OGVb[A%>GbLA,=M 
j&, j j%+j jELj j j	
 	
 	

 r   c                      e Zd ZU dZeZej        ed<   eZ	ej        ed<   dZ
eeeee         f                  ed<   dZee         ed<   dZee         ed<   dZee         ed<   ej        Zeed	<   d
Zeed<   dZeed<   d ZdS ))Float8DynamicActivationFloat8WeightConfigax  
    Configuration for applying float8 dynamic symmetric quantization to both activations and weights of linear layers.

    Args:
        activation_dtype (torch.dtype): The target data type for activation quantization. Default is torch.float8_e4m3fn.
        weight_dtype (torch.dtype): The target data type for weight quantization. Default is torch.float8_e4m3fn.
        granularity (Optional[Union[FP8Granularity, List[FP8Granularity]]]):
            The granularity for quantization. Can be either a single granularity (applied to both
            activations and weights) or a tuple of two granularities (one for activations, one for weights).
            If None, defaults to PerTensor for both. Currently both quantizations need to be the same type. And
            only PerTensor and PerRow are supported.
        mm_config (Float8MMConfig): Configuration for the matrix multiplication. Default uses fast accumulation.
        activation_value_lb (Optional[float]): the lower bound for activation value for calculating scale
        activation_value_ub (Optional[float]): the upper bound for activation value for calculating scale
        kernel_preference (KernelPreference): kernel preference for ops like matmul, grouped matmul etc. by defalut (KernelPreference.AUTO) it will be chosen for user based on hardware or other information, this only needs to be set in weight
        set_inductor_config (bool): if True, adjusts `torchinductor` settings to recommended values.
        version (int): the version of the config, version 1 is using AffineQuantizedTensor that we plan to deprecate/split, version 2 is using Float8Tensor (default)

    r  rA  Nr  r  activation_value_lbactivation_value_ubkernel_preferenceTr#  r   rG  c                 l   t           j                            d           t          | j                  \  }}||g| _        d}t          | j                  r@| j        t          j        t          j	        fv s
J d            | j
        dk    s
J d            d}| j        t          |          | _        d S d S )Nz>torchao.quantization.Float8DynamicActivationFloat8WeightConfigTunimplementedr   Fuse_fast_accum)r   r   r   r+   r  r*   r  r.   rW  TORCHrG  r  r'   )r   r  rB  default_use_fast_accums       r   r(  z7Float8DynamicActivationFloat8WeightConfig.__post_init__6  s    $$L	
 	
 	
 6L6
 6
2 2 34FG!%,T-=>> 	+) % &.       <1$$$o$$$%*">!+;QRRRDNNN "!r   )r   r   r   r*  r$   r  r   r]  r,  rA  r  r
   r   r(   r	   r  r'   r  floatr  r.   rW  r  r#  r-  rG  r+  r(  rk   r   r   r  r    s          ( %/ek... *L%+***IMK%^0D DEFMMM*.Ix'...+/%///+/%///*:*?'??? $$$$GSS S S S Sr   r  rf   c           	         |j         }|j        }|j        }|j        }|j        }|j        }|j        }t          |           |\  }	}
|                                 dv r_t          |	t                    rt          |
t                    s
J d            | j        d         dz  dk    s| j        d         dz  dk    r| S nt          |           s| S t          |
t                    r| j        t          j        k    s
J d            |j        dk    rt%          j        d           t)          | j        dd          |
          }|                                 d	k    r t+          dgt-          |          z             }t/          | ||t          j        t3          |
                    }t4          }|	|d}t7          |||          }nK|j        dk    sJ d|j                     t9          ||	|||          }t;          j        | ||
|||          }|S )N)r     zH4D/5D tensor only supports per tensor activation and weight quantizationr   r  rE   zBPerRow quantization only works for bfloat16 precision input weightzConfig Deprecation: version 1 of Float8DynamicActivationFloat8WeightConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2649 for more detailsr   r   r  r  r  r  quant_kwargsr   r  )hp_value_lbhp_value_ubr  )r  r  r  r  r  )r  rA  r  r  r  r  r  r)   rj  r   rM   r   r  rL   r]  r   rZ  rG  r&  r'  r?   r  rt   r   r  r   r  rO   r<   r/   ro  )r   r   r  rA  r  r  r  r  r  r  rB  r=  r  r   input_quant_kwargsr  s                   r   8_float8_dynamic_activation_float8_weight_quantize_tensorr!  R  s   .&L$K I 4 40 K(((1<.. zz||v0)<< 	V	B
 B
 	V 	VU	V 	V 
 <?R1$$Q"(<(A(AM )BF##  $f-- 
|u~---P .-- ~ q	
 	
 	
 $FL$57IJJ
::<<1sT*%5%5566J5!% 9555
 
 
 <&< 0
 

 :.=O
 
 
 ~"""$K6>$K$K"""7"++/
 
 
 (/%*/-
 
 
 r   c                <   t                      st                      s
J d            |j        r#t          j        j                                         t          | |          sJ d| dd|  dz               t          | t                    rt          |           } t          t          | |          |          }t          | |t          j                            |d                     t#          j        t'          t(          | j        |          |           | _        | S )	NzPFloat8 dynamic activation quantization is only supported on CUDA>=8.9 and MI300+zKapplying float8 dynamic activation quant requires module to have parameter z
 attributez but z does not have oneFr   r  )rC   rB   r#  r7  r8  r9  r:  r   r   r&   r  r!  r   rw   r   r   r   r   r   r   r   r   r  s       r   2_float8_dynamic_activation_float8_weight_transformr#    sR     (**  Z , ! H"EEGGG6>**  pVdppp
,&
,
,
,	- * &,'' /&v..O''  +5AA  
 ( & 1)	
 	
 	

 	 F Mr   c                   h    e Zd ZU dZ e            Zeed<   eZ	e
j        ed<   eZe
j        ed<   d ZdS )rh   a  
    Applies float8 dynamic quantization to activations and float8 quantization followed by compression to sparse semi-structured tensor to weights of linear layers.

    Args:
        `layout`: layout type for quantized weight tensor, only supports `CutlassSemiSparseLayout` at the moment.
        `activation_dtype`: data type for quantized activation tensor.
        `weight_dtype`: data type for quantized weight tensor.
    r!  r  rA  c                 D    t           j                            d           d S )NzHtorchao.quantization.Float8DynamicActivationFloat8SemiSparseWeightConfigr  r   s    r   r(  zAFloat8DynamicActivationFloat8SemiSparseWeightConfig.__post_init__  s)    $$V	
 	
 	
 	
 	
r   N)r   r   r   r*  r   r!  r#   r,  r%   r  r   r]  r$   rA  r(  rk   r   r   rh   rh     so           -,..FF...$.ek... *L%+***
 
 
 
 
r   rh   c                    t                      s
J d            t          | t                    rt          |           } | j        }|j        }|j        }|j        }t          |t                    st          d| d          t          ||          }t          |t          d|i          }t          j                            |d          | _        t!          j        t$          |           | _        | S )Nz2Float8 quantization is only supported on CUDA>=9.0z;Only CutlassSemiSparseLayout layout is supported. Received rr   r
  r  Fr   )rD   r   r&   r  r   rA  r  r!  r   r  r  rO   r  r   r   r   r   r   r   r   )r   r   r   rA  r  r!  s         r   >_float8_dynamic_activation_float8_semi_sparse_weight_transformr'    s     TT TTT&,'' /&v..]F&L.]Ff566 
!S&SSS
 
 	
 *&,??F+$&67  F H&&vU&CCFM();VDDFMr   c                       e Zd ZU dZej        ed<   eZej	        ed<   eZ
ej	        ed<   dZeeeeeef         f                  ed<    ed          Zee         ed	<   dZeed
<   d ZdS )(Float8StaticActivationFloat8WeightConfiga]  
    Configuration for applying float8 static symmetric quantization to

    Args:
        scale (torch.Tensor): The scale tensor for activation quantization.
        activation_dtype (torch.dtype): The target data type for activation quantization. Default is torch.float8_e4m
        weight_dtype (torch.dtype): The target data type for weight quantization. Default is torch.float8_e4m
        mm_config (Float8MMConfig): Configuration for the matrix multiplication. Default uses fast accumulation.
        set_inductor_config (bool): if True, adjusts `torchinductor` settings to recommended values.
    r|  r  rA  Nr  Tr  r  r#  c                 l    t           j                            d           t          j        d           d S )Nz=torchao.quantization.Float8StaticActivationFloat8WeightConfigz`Float8StaticActivationFloat8WeightConfig` will be moving to prototype in a future release of torchao. Please see https://github.com/pytorch/ao/issues/2752 for more details.r%  r   s    r   r(  z6Float8StaticActivationFloat8WeightConfig.__post_init__  sE    $$K	
 	
 	
 	 |	
 	
 	
 	
 	
r   )r   r   r   r*  r   r   r,  r$   r  r]  rA  r  r
   r   r(   r   r'   r  r#  r-  r(  rk   r   r   r)  r)    s         	 	 <$.ek... *L%+*** 	 neNN$BCCD    +9.*M*M*MIx'MMM $$$$
 
 
 
 
r   r)  rg   c           	         t                      st                      s
J d            t          | t                    rt	          |           } |j        }|j        }|j        }|j        }|j	        }|j
        r#t          j        j                                         | j        }t!          |          \  }}	t          |t"                    s
J d            t%          |          s| S t'          |j        |	          }
t+          ||
|t,          j        t1          |                    }t2          }||d}t5          |||d |          }t,          j                            |d          | _        t;          j        t>          |           | _         | S )	NzMFloat8 static activation quantization is only supported on CUDA 8.9 and abover  r  r  r  )r|  r}  r  Fr   )!rC   rB   r   r&   r  r|  r  rA  r  r  r#  r7  r8  r9  r:  r   r+   rM   r  r?   r   r   r   r  r   r  r@   r   r   r   r   r   r   )r   r   r|  r  rA  r  r  r   r  rB  r=  r  r   r   s                 r   1_float8_static_activation_float8_weight_transformr,  (  s     (**  W , &,'' /&v..LE.&L$K I! H"EEGGG]F1G1T1T..,i88  A 8 &!!  .@AAJ1!My111   8"8, 
 U'   H&&'7u&MMFM();VDDFMr   c                   f    e Zd ZU dZej        ed<   dZeed<   dZ	eed<   dZ
eed<   d	Zeed
<   d ZdS )UIntXWeightOnlyConfiga  
    Configuration for applying uintx weight-only asymmetric per-group quantization to linear layers, using uintx quantization where
    x is the number of bits specified by `dtype`

    Args:
        `dtype`: torch.uint1 to torch.uint7 sub byte dtypes
        `group_size`: parameter for quantization, controls the granularity of quantization, smaller
         size is more fine grained, defaults to 64
        `pack_dim`: the dimension we use for packing, defaults to -1
        `use_hqq`: whether to use hqq algorithm or the default algorithm to quantize the weight
        `set_inductor_config`: if True, adjusts `torchinductor` settings to recommended values.
    r]  @   r   ro   pack_dimFr  Tr#  c                 l    t           j                            d           t          j        d           d S )Nz*torchao.quantization.UIntXWeightOnlyConfigz`UIntXWeightOnlyConfig` will be moving to prototype in a future release of torchao. Please see https://github.com/pytorch/ao/issues/2752 for more details.r%  r   s    r   r(  z#UIntXWeightOnlyConfig.__post_init__v  s=    $$%QRRR i	
 	
 	
 	
 	
r   N)r   r   r   r*  r   r]  r,  r   r+  r0  r  r-  r#  r(  rk   r   r   r.  r.  a  s           ;JHcGT $$$$
 
 
 
 
r   r.  rc   c                    |j         }|j        }|j        }|j        }|j        r#t
          j        j                                         | j	        }ddl
m} t          j        t          j        t          j        t          j        t          j        t          j        t          j        t          j        h}||v sJ d|             t*          j        }	d|f}
|rb|t          j        k    rt.                              d           ||         \  }}t          j        }d }d }t2          j        }d}t7                      }nSd\  }}t          j        t          j                  j        }t          j        }t2          j         }d}tC          ||	          }tE          ||	|
|||||||||
          }t          j#        $                    |d          | _	        tK          j&        tN          |           | _(        | S )Nr   )rS   zUnsupported dtype for hqq: rE   zgRecommended to use `Int4WeightOnlyConfig(group_size, use_hqq=True, version=1)` for the best performanceFNNT)r]  r0  )r  r  r  r  rh  rg  r5  r  r   ))r]  r   r0  r  r#  r7  r8  r9  r:  r   %torchao.quantization.quant_primitivesrS   r   uint1uint2uint3uint4uint5uint6uint7r  rT   r  r  r\  rU   FLOATr   r  r  r  r  rr  r   r   r   r   r   r   r   r   )r   r   r]  r   r0  r  r   rS   SUPPORTED_DTYPESr	  r=  r  r  r  r  rh  rg  r5  rx  s                      r   _uintx_weight_only_transformr>    s    LE"JHnG! H"EEGGG]FMMMMMM 		 $$$$&KE&K&K$$$)LZJ >EKNNy    7u=	9+1--)	9k%-((, ;+/EH===))+#  J H&&z&GGFM();VDDFMr   intx_unpacked_tensor	hp_tensorr  c           	         t          | t                    sJ | j                            |          | _        t          | j                 \  }}t          || j        | j        | j        t          j
        ||          | _        dS )ad  
    Adjusts the scale_dtype on IntxUnpackedToInt8Tensor.
    Updating the scale dtype requires updating the qdata because qdata is calculated after the scale.
    This is used in IntxWeightOnlyConfig and Int8DynamicActivationIntxWeightConfig to make
    version=2 and version=1 numerically equivalent when the scale_dtype differs from the input dtype
    )output_dtyper  r  N)r   r;   r|  rs   rS   r
  rV   r=  r}  r   r  qdata)r?  r@  r  qminqmaxs        r   rp  rp    s     *,DEEEEE!5!;!>!>{!K!K()=)JKJD$!0'"'Z" " "r   c                       e Zd ZU dZej        Zej        ed<    e	d          Z
eed<   ej        Zeed<   dZeej                 ed<    e            Zeed<   ej        Zeed	<   ej        Zeed
<   dZeed<   d ZdS )IntxWeightOnlyConfiga  
    Configuration for quantizing weights to torch.intx, with 1 <= x <= 8.
    Weights are quantized with scales/zeros in a groupwise or channelwise
    manner using the number of bits specified by weight_dtype.
    args:
        `weight_dtype`: The dtype to use for weight quantization.  Must be torch.intx, where 1 <= x <= 8.
        `granularity`: The granularity to use for weight quantization.  Must be PerGroup or PerAxis(0).
        `mapping_type`: The type of mapping to use for the weight quantization.
            Must be one of MappingType.ASYMMETRIC or MappingType.SYMMETRIC.
        `scale_dtype`: The dtype to use for the weight scale.
        `layout`: The layout to use for the packed weight tensor:
            - QDQLayout: this layout is designed for export to ExecuTorch.this layout represents the quantization with Q/DQ quant primitives,
                and is intended for export applications like ExecuTorch.
        `intx_packing_format`: The format to use for the packed weight tensor (version 2 only).
        `intx_choose_qparams_algorithm`: The algorithm to use for choosing the quantization parameters.
        `version`: version of the config to use, only subset of above args are valid based on version, see note for more details.

        Note:

        Current state for IntxWeightOnlyConfig is that it supports both v1 (legacy) and v2.

        * `intx_packing_format` is used for version 2.
        * `layout` is only used for version 1.
    rA  r   r  r	  Nr  r!  rE  rF  r   rG  c                 
   t           j                            d           | j        d t	          dd          D             v sJ d| j                     t          | j        t          t          f          sJ d| j                     t          | j        t                    r'| j        j	        dk    sJ d| j        j	                     | j
        t          j        t          j        t          j        fv sJ d	| j
                     d S )
Nz)torchao.quantization.IntxWeightOnlyConfigc                 >    g | ]}t          t          d |           S rJ  rK  rL  s     r   rO  z6IntxWeightOnlyConfig.__post_init__.<locals>.<listcomp>		  rP  r   rE   rQ  rR  z1granularity must be PerAxis or PerGroup, but got r   ra  zvmapping_type must be MappingType.ASYMMETRIC, MappingType.SYMMETRIC, or MappingType.SYMMETRIC_NO_CLIPPING_ERR, but got )r   r   r   rA  rS  r   r  rJ   rK   rT  r	  rT   r  r  rU  r   s    r   r(  z"IntxWeightOnlyConfig.__post_init__	  sF   $$%PQQQ $T$TaQR$T$T$TTTT^4K\^^ UTT $*Wh,?@@ 	
 	
R@PRR	
 	
@ d&00 	#(A---O8H8MOO .--  "!1%
 
 
 

 Y  FJ  FW  Y  Y
 
 
 
 
r   )r   r   r   r*  r   r  rA  r]  r,  rJ   r  rI   rT   r  r	  r  r
   r   r!  r#   r:   r^  rE  r8   r_  rF  rG  r+  r(  rk   r   r   rG  rG    s          2 !&
L%+***&wqzzK))) + 5L+555)-K%+&---Y[[FF   ->-O*OOO") "#=    GS
 
 
 
 
r   rG  c                   |j         }|j        }|j        }|j        }|j        }|j        }	|j        }
|                                 dk    rd}n?|                                 dk    rd}n$t          d|                                            t          |t                    r|j        }nRt          |t                    r+|j        dk    sJ d|j                     | j        |         }nt          d|           |                                 dk    rd|f}n |                                 dk    sJ d|ddf}|j        dk    r|j        t           j        k    rq|4|j        t&          j        k    r|                    t&          j                  }t/          j        | ||||||
	          }||| j        k    rt3          || |           |S t          d
|	           |j        t4          j        k    s
J d            |j        dk    sJ t9          j        d           t<          |         \  }}t?          | ||t&          j        |||t&          j        |t@          j!        k    tD          j#        |          } | S )Nr   ro   r  rE   z>IntxWeightOnlyConfig only works for 2-d and 4-d Tensors, got: r   ra  z-granularity must be PerGroup or PerAxis, got )r	  r/  r0  rF  rb  z(version 1 only supports affine algorithmzConfig Deprecation: version 1 of IntxWeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2967 for more detailsre  )$rA  r  r	  r  r!  rE  rF  rj  r   r   rK   r   rJ   rT  r   rG  r:   r^  r]  r   r  rs   r  r;   ro  rp  r8   r_  r&  r'  rS   r   rT   r  rU   rr  )r   r   r/  r0  rA  r  r	  r  r!  rE  rF  	input_dimr   r=  rx  r  r  s                    r   !_intx_weight_only_quantize_tensorrL  	  s    &L$K&L$K]F 4$*$H!zz||q							[VZZ\\[[
 
 	
 +x(( X +

	K	)	) X1$$$FK4DFF %$$ \),

VVVWWWzz||q_

 zz||q    Q*
~%):)KKK ,1B1HEK1W1W$5$8$8$D$D!19))"3.K  J &;&,+F+F;   Q<OQQRRR /3M3TTTT2 UTT >QM 	X   3<@Iy%!Z#{'<<)-  F Mr   c                   t          | d          s
J d            t          | j        |||          }t          j                            |d          | _        t          | t          j                  r t          j	        t          |           | _        n9t          | t          j                  rt          j	        t          |           | _        | S )Nr   zgapplying intx weight only quant requires module to have weight attribute but {module} does not have oner.  Fr   )r   rL  r   r   r   r   r   r   r   r   r   r   	Embeddingr   )r   r   r/  r0  rx  s        r   _intx_weight_only_transformrO  u	  s     68$$  	, $ 3!+	  J H&&z&GGFM&")$$ L!,-?HH	FBL	)	) L!,-BFKKMr   c                   <    e Zd ZU dZeed<   eed<   dZeed<   d ZdS )FPXWeightOnlyConfiga  Sub-byte floating point dtypes defined by `ebits`: exponent bits and `mbits`: mantissa bits
    e.g. fp6_e3_m2, fp6_e2_m3, ...
    The packing format and kernels are from the fp6-llm paper: https://arxiv.org/abs/2401.14112
    github repo: https://github.com/usyd-fsalab/fp6_llm, now renamed to quant-llm
    For more details for packing please see: :class:`~torchao.dtypes.fpx.FpxTensorCoreAQTTensorImpl`

    This is experimental, will be merged with `to_affine_quantized_floatx`
    in the future
    ebitsmbitsTr#  c                 l    t           j                            d           t          j        d           d S )Nz(torchao.quantization.FPXWeightOnlyConfigz`FPXWeightOnlyConfig` will be moving to prototype in a future release of torchao. Please see https://github.com/pytorch/ao/issues/2752 for more details.r%  r   s    r   r(  z!FPXWeightOnlyConfig.__post_init__	  s=    $$%OPPP g	
 	
 	
 	
 	
r   N)	r   r   r   r*  r+  r,  r#  r-  r(  rk   r   r   rQ  rQ  	  sT           JJJJJJ $$$$
 
 
 
 
r   rQ  rd   c                    |j         }|j        }| j        }|j        r#t          j        j                                         t          | t                    rt          |           } ddlm} ddlm} |                                dk    sJ d|                                             |j        \  }}|dz  dk    s	|dz  dk    r2t"                              d||z   d	z    d
| d
| d| d| d           | S  |||          }	 |||	          }
t&          j                            |
d          | _        t-          j        t0          |           | _        | S )Nr   )to_affine_quantized_fpx)FloatxTensorCoreLayoutr   z'floatx only works for 2-d Tensor, got: r/     z"Skipping floatx quantization floatrE   r   z= because the shape is not compatible with the kernel: in_dim=z
, out_dim=z1 expected in_dim % 64 == 0 and out_dim % 256 == 0Fr   )rR  rS  r   r#  r7  r8  r9  r:  r   r&   r  r  rV  torchao.prototype.dtypes.floatxrW  rj  r   r  r  r   r   r   r   r   r   r   )r   r   rR  rS  r   rV  rW  r	  r
  r5  rx  s              r   _fpx_weight_only_transformrZ  	  s    LELE]F! H"EEGGG&,'' /&v..666666FFFFFF::<<1V

VVlOGVqgmq00?1B ? ?U ? ?U ? ?CI? ?U\? ? ?	
 	
 	

 $$UE22G((99JH&&z&GGFM();VDDFMr   c                       e Zd ZU dZ ee          Zeee	e
         f         ed<    ee          Zeee	e
         f         ed<   dZeed<   d Zd Zd	S )
r   a  Configuration class for applying different quantization configs to modules or parameters based on their fully qualified names (FQNs).

    Args:
        `fqn_to_config`: typing.OrderedDict[str, Optional[AOBaseConfig]]: an
         ordered dictionary from
             (1). fully qualified name (fqn) of module or parameter
             (2). regex of fully qualified name (in python `re` module regex format), should
                  start with prefix "re:" or
             (3). "_default"
         to the config that we want to apply to the module/param or None

         Config key ordered by precedence:
           * fully qualified parameter name, e.g. `language.layers.0.q_proj.weight`
           * fully qualified module name, e.g. `language.layers.0.q_proj`
           * regex for parameter names, must start with `re:`, e.g. `re:language\.layers\..+\.q_proj.weight`.
             The first regex that matches will be applied.
           * regex for module names, must start with `re:`, e.g. `re:language\.layers\..+\.q_proj`,
             whichever regex fully matches the module fqn first will be applied
             (order of keys for dictionary are kept consistent since we are using OrderedDict)
           * "_default", fallback if no match for all previous keys
             (Note, when using `_default`, the config is applied to all modules, to apply
              it to only a subset of modules, e.g. with some types, it's better to filter
              the modules that we don't want to quantize before hand and configure them to
              None, e.g. `{"re:.+norm.+": None, "_default": linear_config}`) "_default" is not supported when filter_fn is not specified.
        `module_fqn_to_config`: typing.OrderedDict[str, Optional[AOBaseConfig]]: To maintain BC with ModuleFqnToConfig, to be deprecated later
        `version`: int: Version of config to use.

    Note:
        - The order of patterns in the OrderedDict may matter as only the first matching pattern is applied
        - "_default" is ignored for parameter replacement.
    )default_factoryr   module_fqn_to_configrE   rG  c                    t           j                            d           t          | j                  dk    r7t          | j                  dk    r| j        | j        k    rt          d          t          | j                  dk    r$t          | j                  dk    r| j        | _        t          | j                  dk    r$t          | j                  dk    r| j        | _        d| j        v rt          j        d           d S d S )Nz torchao.quantization.FqnToConfigr   zP`fqn_to_config` and `module_fqn_to_config` are both specified and are not equal!r   zConfig Deprecation: _default is deprecated and will no longer be supported in a future release. Please see https://github.com/pytorch/ao/issues/3229 for more details.)	r   r   r   r   r   r]  r   r&  r'  r   s    r   r(  zFqnToConfig.__post_init__	  s   $$%GHHH "##a''D-..22"d&???b  
 t())A--#d6H2I2IQ2N2N!%!:Dt!""Q&&3t/H+I+IQ+N+N(,(:D% +++M y     ,+r   c                 x    d                     dgd | j                                        D             d          S )N
zFqnToConfig({c              3   .   K   | ]\  }}d | d| dV  dS )z  'z':
    ,Nrk   )rM  keyvalues      r   	<genexpr>z&FqnToConfig.__str__.<locals>.<genexpr>
  sM        "U 0#//u///     r   z}))r   r   itemsr   s    r   __str__zFqnToConfig.__str__
  sZ    yy &*&8&>&>&@&@   	
 	
 		
r   N)r   r   r   r*  r   r   r   OrderedDictTyper  r
   r   r,  r]  rG  r+  r(  rg  rk   r   r   r   r   	  s          @ CH%#C C CM?3(>#>?    JO#J J J/#x/E*EF    GS  2

 

 

 

 

r   r   fqnc                    d}g }t          t          |                                                     D ]N\  }\  }}|t          |           v r5t	          |          dk    r| d| n|}|                    ||||f           Ot          |          D ]\  }}}}||j        v rd}|j        |         }	|	|                    |           7t          t          |	                   }
t          |	          t          v r |
| |	|          } vt          t          |	           d          |s@||j        v r7|j        |         }	|	&t          t          |	                   }
 |
| |	          S | S |D ]\  }}}}|j        D ]}|                    d          rt          j        |d	d         |          rod}|j        |         }	|	^t          t          |	                   }
t          |	          t          v r |
| |	|          } t          t          |	           d          |ss|j        D ]k}|                    d          rTt          j        |d	d         |          r7|j        |         }	|	(t          t          |	                   }
 |
| |	          c S l|sC|j                            d
d          }	|	&t          t          |	                   }
 |
| |	          S | S )aw  This function expects a module that either is specified in FqnToConfig or has a parameter that is specified in FqnToConfig.

    Args:
        module (torch.nn.Module): The module to be processed.
        fqn (str): The fully qualified name of the module containing the parameters.
        config (FqnToConfig): Configuration object containing regex patterns / fqn mapped
            to quantization configurations.

    Returns:
        torch.nn.Module: The modified module with quantized parameters.

    Raises:
        NotImplementedError: If the quantization configuration is not yet supported for parameter quantization.
    Fr   rr   TNr  zs does not yet support parameter quantization! Please see https://github.com/pytorch/ao/issues/3252 for more detailsre:r   r   )	enumeratert   named_parametersdirr   r   r   r   r=   r   +CUSTOM_PARAM_QUANTIZATION_SUPPORTED_CONFIGSr  
startswithre	fullmatchget)r   ri  r   parameter_config_foundtop_level_paramsir   paramparameter_fqncr   patterns               r   r   r   +
  sv   & #&/V5L5L5N5N0O0O&P&P O O""NES[[((-0XX\\3)))))~  ##Q}$MNNN 488H3I3I  />5-F000%)"$]3Ay $$Q''''2477;77III$WVQ~NNNFF-77  X  X  X   1" " cV-A&A&A %=.tAww7G761%%%M 4D  />5-+ 	 	G!!%(( R\'!""+}-U-U )-&(1=6tAww?GAww"MMM!(>!R!R!R1#Aww  \  \  \  	 " .+ 	. 	.G!!%(( .R\'!""+s-K-K .(1=6tAww?G"761----- " & $$Z66=.tAww7G761%%%Mr   c                     | |j         v r%|                     d          rJ d|  d            dS |j         D ]7}|                    d          r t          j        |dd         |           r dS 8dS )ae  Check if a given fqn matches the exact fqn or regex pattern specified in FqnToConfig.

    Args:
        fqn (str): The fully qualified name of the module.
        config (FqnToConfig): Configuration object containing regex patterns or raw FQNs for quantization.

    Returns:
        bool: True if the fqn is specified in FqnToConfig. False otherwise.
    rk  zError: Exact match but regex z specified.Tr   NF)r   rp  rq  rr  )ri  r   !maybe_module_or_param_fqn_patterns      r   r   r   
  s     f""">>%(( 	
 	
<C<<<	
 	
( t171E 	 	-0;;EBB r|1!""5sH H  tt5r   c                     |                                  D ]E\  }}|t          |           v r/t          |          dk    r| d| n|}t          ||          r dS FdS )a  Check if a given module contains top-level parameters that match the exact fqn or regex pattern specified in FqnToConfig.

    Args:
        module (nn.Module): The module to be checked.
        fqn (str): The fully qualified name of the module.
        config (FqnToConfig): Configuration object containing regex patterns or raw FQNs for quantization.

    Returns:
        bool: True if the module contains top-level parameters that match the fqn or regex pattern specified in FqnTo
    r   rr   TF)rm  rn  r   r   )r   ri  r   r}   rw  rx  s         r   r   r   
  st     ..00  e3v;;/23xx!||sOOTOOOM%mV<< tt5r   c                     t          j        d          5  t          j        | j        | j                  }ddd           n# 1 swxY w Y   | j        |_        | j        |_        |S )aY  
    Unwrap a torchao Float8Linear by returning a nn.Linear with the same weights and bias.

    Torchao inference quantization techniques are generally only applicable to nn.Linear
    layers, so this helper is useful for unwrapping models trained with torchao float8 training,
    which replaces nn.Linear layers with Float8Linear layers.
    metaN)r   rq   r   r   r  out_featuresr   r   )r   
new_modules     r   r  r  
  s     
f		 H HYv163FGG
H H H H H H H H H H H H H H HJkJOs    AAA)rj   Nrk   )Fr   r3  )r*  r[  rq  r   r&  collectionsr   dataclassesr   r   	functoolsr   typingr   r   r	   r
   r   r   rh  r   torch.nnr   torch.nn.utils.parametrizer9  r   r7  torchao.core.configr   r  r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    Mtorchao.dtypes.uintx.packed_linear_int8_dynamic_activation_intx_weight_layoutr!   r"   torchao.dtypes.utilsr#   torchao.float8.configr$   r%   torchao.float8.float8_linearr&   torchao.float8.inferencer'   r(   r)   r*   r+   =torchao.quantization.linear_activation_weight_observed_tensorr,   torchao.quantization.observerr-   %torchao.quantization.quantize_.commonr.   (torchao.quantization.quantize_.workflowsr/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   %torchao.quantization.transform_moduler=   r>   torchao.quantization.utilsr?   Atorchao.quantization.weight_tensor_linear_activation_quantizationr@   torchao.utilsrA   rB   rC   rD   rG   rF   GPTQrH   r  rI   rJ   rK   rL   rM   "linear_activation_quantized_tensorrN   rO   linear_quant_modulesrP   rQ   qatrR   quant_primitivesrS   rT   rU   rV   unifiedrW   rX   rY   	getLoggerr   r  __all__r<  rr  r  r  rv   r   r[   rZ   r   r  r-  r   r   r   r   r   r   r   Devicer\   r  r  r  r  r]   r>  r@  r  r  r  r  r  r  re   r  r  r`   r  r  r  r  r  ra   r  r  r  r  r;  r<  r]  r  r  r  r^   r  r  r_   r  rb   r  r  r  r  r  rf   r!  r#  rh   r'  r)  rg   r,  r.  rc   r>  rp  rG  rL  rO  rQ  rd   rZ  r   ri   ro  r   r   r   r   r  serializationadd_safe_globalsrk   r   r   <module>r     s  	 	  				   # # # # # # ( ( ( ( ( ( ( (       > > > > > > > > > > > > > > > > 1 1 1 1 1 1        0 0 0 0 0 0 0 0 0 0 0 0  , , , , , ,                                         *        ( ' ' ' ' ' 8 8 8 8 8 8 8 8 5 5 5 5 5 5                   F E E E E E                                             6 5 5 5 5 5                 > = = = = = = =                                                 1 0 0 0 0 0 0 0 , , , , , ,		8	$	$  6 O12,-O)*O)?+>?	  555	  ,.+ + sCx)+ 
+ + + +\  &   D   J CGN N N9N89N 9:N
 %(/3!7!=>?N N N Nb-u| - - - - @ @ @D D D
0 
0 
0 ).e    . CM+/	F
 F
8?F
F
 %(/3!7!=>?F
 U['(	F
 F
 F
 F
R5< EL    $EL U\    ,%, 5<    & 
 
 
 
 
L 
 
 
< '@&?)+P' ' #
 "!"GHH
 ,004L L LHOL1L 5<(	L
  -L L L IHL^ \ \ \ \ \L \ \ \H ,004M M M
 5<(M  -M M M M` "!"GHH
 ,004  HO1 5<(	
  - X_   IH, 
 
 
 
 
L 
 
 
2 '@&?)+P' ' #
 "!"GHHHO%J
X_   IH: 
 
 
 
 
< 
 
 
< 65!=  
 "!">??HO%A   @?> )R )R )R )R )R< )R )R )R\ -,-?AUVV z z zz "!"677HO%9
X_   87  
; 
; 
; 
; 
;l 
; 
; 
; "!"IJJHO%L
X_   KJ2     <   6 -,-?AUVV   6 "!"677
 #	  HO  	   87@     $
|

\
 
 
 
.	 	 	 	 	 	     
|
+
 \
 
 
 

|
+
 lEL!
 
 
 
 
 
 
 
 
L 
 
 
> '@&?)+P' ' #
J J JZ "!"GHHHO%J
X_   IH$L L L T T T T T\ T T T, /.0  
  0 "!"899
 #	" " "HO""" 	"
 X_" " " :9"R %))-& &|&*& k& EL!	&
 && & & &R5< D    6 2S 2S 2S 2S 2S 2S 2S 2Sl +D*C-/X+ + '
T T Tn "!"KLL
 #	" " "HO"5" 	" " " ML"J 
 
 
 
 
, 
 
 
( "!"UVVHO%X   WV< 
 
 
 
 
| 
 
 
> *C)B,.V* * &
 "!"JKK5HO5%M5 5 5 LK5p 
 
 
 
 
L 
 
 
8 .-.  
 "!"788AHOA%:A A A 98AH2|  
	   2 7
 7
 7
 7
 7
< 7
 7
 7
| ,004V V V 5<(	V
  -V V V Vr "!"677
 ,004  HO  5<(	
  - X_   876 
 
 
 
 
, 
 
 
. ,+,=?RSS "!"566HO%8
X_   76@ M
 M
 M
 M
 M
, M
 M
 M
b    ./ +YHOY	Y Y Y Y Yx	   6I	    0, 29       $ $#0(  $	    r   