
    Pi$              	          d dl Z d dlmc mZ d dlmZ d dlZd dlm	Z	m
Z
mZmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d d	lmZmZ d d
lmZmZm Z m!Z! ddl"m#Z#m$Z$ g dZ%e j&        j'        Z'i Z(d Z)d Z* G d de j+                  Z, e j-                    d             Z.d Z/de j+        de j+        fdZ0de j+        de j+        fdZ1de j+        de j+        fdZ2 G d d          Z3 G d de3e          Z4 G d de4          Z5 G d d e	e3          Z6 G d! d"e6e3          Z7 G d# d$e6e3          Z8 G d% d&ee3          Z9 G d' d(e9          Z: G d) d*e9          Z; G d+ d,e9          Z< G d- d.e9          Z= G d/ d0ee3          Z> G d1 d2e>          Z? G d3 d4e>          Z@ G d5 d6e>          ZA G d7 d8e j+        e3          ZB G d9 d:e          ZCeCD                    e j        j        jE                  eCF                    e'jE        jG                  d;                         ZHeCF                    e'jI        jG                  d<             ZHeCF                    e'jJ        jG                  d=             ZHeCF                    e'jK        jG                  d>             ZH G d? d@eC          ZL G dA dBeC          ZM G dC dDeCe3          ZN G dE dFeLe3          ZO G dG dHeMe3          ZP G dI dJe	e3          ZQ G dK dLe3e          ZR G dM dNe3e          ZSeBe6e7e4gZTeBe4e:gZUeBe4e?gZVeNeOePgZWeBeQeReSgZXeBe=e5gZYeTeUz   eWz   ZZeZe?gz  ZZ e            reZeQeSgz  ZZ e             reZeRgz  ZZ e            seZeYz  ZZ e[ e\eZ                    ZZdO Z]	 dVdQZ^ e j-                    deTddRdSgdTdPdPdfdU            Z_e j`        a                    eZ           e j`        a                    e0e1e2g           dS )W    N)return_and_correct_aliasing)AffineQuantizedTensorFloat8LayoutMarlinSparseLayoutPlainLayoutSemiSparseLayoutTensorCoreTiledLayout)Layout)Float8MMConfigsafe_int_mm)LinearActivationQuantizedTensor)MappingTypeZeroPointDomain)%_quantize_activation_per_token_absmaxcompute_error)TorchAOBaseTensoris_sm_at_least_89is_sm_at_least_90torch_version_at_least   )PerRow	PerTensor)	AutoQuantizableLinearWeight	autoquantDEFAULT_AUTOQUANT_CLASS_LIST!DEFAULT_INT4_AUTOQUANT_CLASS_LIST!GEMLITE_INT4_AUTOQUANT_CLASS_LIST"DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST#DEFAULT_SPARSE_AUTOQUANT_CLASS_LISTOTHER_AUTOQUANT_CLASS_LISTALL_AUTOQUANT_CLASS_LISTc                 @    t                               | f|z   d           S N)_AUTOQUANT_CACHEget)clsshapes_and_dtypes     r/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchao/quantization/autoquant.py_check_cacher*   ?   s     )9 94@@@    c                 "    |t           | f|z   <   d S r$   )r%   )r'   r(   ress      r)   _update_cacher.   C   s    25cV..///r+   c                      e Zd ZdZeddgddd            ZddgdddZd Zed             Zd	 Z	 e
j                    d
             Zd Zd Ze	 dd            Zed             Zedd            Zed             ZdS )r   a  
    A subclass of torch.Tensor that, when run, finds the best type of quantization for itself and swaps
    its data with the quantized version.

    Args:
        weight (torch.Tensor): The initial weight tensor.
        qtensor_class_list (list): A list of tensor classes to be considered for quantization.
        *args: Additional positional arguments.
        mode (list, optional): A list containing mode settings for quantization. The first element is the mode type
                               (e.g., "relu"), and the second element is the mode value (e.g., None). Defaults to ["relu", None].
        **kwargs: Additional keyword arguments.
    reluN)modemin_sqnrc                Z   |j         |d<   |                    dd          r|                    d          n|j        |d<   |                    dd          r|                    d          n|j        |d<   d|d<   |                    d|j                  }t          j        j        | |fi |S )NdevicelayoutFdtyperequires_gradshape)	r4   r&   r5   r6   popr8   torchTensor_make_wrapper_subclass)r'   weightqtensor_class_listr1   r2   argskwargsr8   s           r)   __new__z#AutoQuantizableLinearWeight.__new__V   s     "=x$*JJx$?$?RFJJx   V] 	x $*::gu#=#=OFJJw6< 	w #(

7FL11|23HHHHHr+   c                L    || _         || _        i | _        || _        || _        d S r$   )r=   r>   logged_datar1   r2   )selfr=   r>   r1   r2   r?   r@   s          r)   __init__z$AutoQuantizableLinearWeight.__init__k   s-     "4	 r+   c                 p    | j         j         d| j         d| j         d| j         d| j         d| j         dS )Nz(data=z, shape=z	, device=z, dtype=z, qtensor_class_list=))	__class____name__r=   r8   r4   r6   r>   rD   s    r)   __repr__z$AutoQuantizableLinearWeight.__repr__z   su    ~& g gdk g g4: g gkg g+/:g gLPLcg g g	
r+   c                 4   |                      d| j        d                   } | j        }| j        |j        |d n|j        f}||fz   }d|j                            |d          z   |j        |<   |j        D ]#}t          ||          t          ||d            $d S )Nr   r   )reshaper8   r6   rC   r&   r>   r*   r.   )act_matw_autoquantbiaslogged_dtypelogged_shapesr(   q_clss          r)   	log_shapez%AutoQuantizableLinearWeight.log_shape   s    //"gmB&788}MLDDdj

 )L?:458O8S8Sa9
 9
 5
 01 !3 	= 	=EE#344<e%5t<<<	= 	=r+   c           
      .   |\  }}}}t          ||          |t          j                    5  t          j        ||| j                  }|d nt          j        ||| j                  }		 t
                              || j        |	          }
|                    ||                    | j                  |	          }| j	        Et          ||
          x}| j	        k     r*t          d| d| j	         d|            t          j        }n#|                    || j        |	|| j                  }n@# t          $ r3}t          d|j         d| d|            t          j        }Y d }~nd }~ww xY wt#          |||           d d d            d S # 1 swxY w Y   d S d S )N)r6   r4   zskipping q_cls: z7 because the sqnr is too small, minimum expected sqnr: z, got zwarning: failed to autoquant z for shape: z due to )r*   r:   no_gradrandnr4   AQDefaultLinearWeight_quantized_linear_opr=   
from_floatr2   r   printinf_autoquant_testr1   	ExceptionrI   r.   )rD   rT   r(   	best_time	act_shapew_shape
bias_shape	act_dtyperO   rQ   
ref_outputq_outputsqnrr-   es                  r)   tune_autoquantz*AutoQuantizableLinearWeight.tune_autoquant   sO   4D1	7J	/008  <  <+iyUUU ") DZyUUU 
$!6!K!Kd" "J  %99!1!1$+!>!>   H 1%28Z%H%HHT-( (  Iu  I  Imqmz  I  I  CG  I  I   $i#33#T[$	49  ! $ $ $qqqTdqqnoqq    )CCCCCC	$
 e%5s;;;A <  <  <  <  <  <  <  <  <  <  <  <  <  <  <  <  <  < 98s<   =F*B?D*)F*
E'4)E"F"E''FFFc           
      \   |r| j         i k    rt          d          | j         i k    r#|s!t                              | j                  } | S d}d}dd}t
          j        }d }| j        D ]
}d}	d}
 || d          }| j                                         D ]\  }}t          ||          h|rd} || d           t          ||          }|t
          j        n|}| 
                    |||           d}t
          j                                         |	t          ||          |z  z  }	|
|z  }
|	|
z  }	|#|dk    r|rt          d|	d	d
| d|d	d           |	t
          j        k    r
||	k    r|	}|}|rt          d| d           |t          }|                    | j                  } | S )Nz?must run module normally to get shape, dtype info for autoquantFTc                     d}| j                                         D ](\  }}|dz  }|r|\  }}}}t          d| d|            )|rt          d| d| d|            |S )Nr   r   zactivation_shapes: z, times_seen: zweight_shape: z	, dtype: z, bias_shape: )rC   itemsr\   )	rD   do_printdiffere_shape_countr(   
times_seenra   weight_shaperc   r6   s	            r)   count_shapesz>AutoQuantizableLinearWeight.to_quantized.<locals>.count_shapes   s    "#040@0F0F0H0H W W, *#q(# WAQ>I|ZU	UUUUVVV ]\]]E]]Q[]]   '&r+   r   )rm   r   z>time (all shapes): z0.4fms for z, prev_best: msz	best_cls=
T)rC   RuntimeErrorrY   r[   r=   r:   r]   r>   rl   r*   ri   _dynamoresetr\   )rD   error_on_unseenr@   ran_new_benchmarksprint_shape_oncerq   r`   best_clsrT   cur_time
total_seenshape_countr(   ro   time_for_best_shapes                  r)   to_quantizedz(AutoQuantizableLinearWeight.to_quantized   sZ    	t/255Q   "$$o$(33DK@@DK #	' 	' 	' 	' I	, 	! 	!EHJ&,te<<<K040@0F0F0H0H ) ), *'788@' :+0($TD9999*6xAQ*R*R' /6 		0 (
 ''/?ATUUU)-&M'')))L0@AAJNNj(

*,H&;???Q?g8gggggU^gggg   59$$h)>)>$	  	,*h***+++,H ""4;//r+   c                 z    |                       || j                  | j        | j        | j        | j                  S )N)r6   r1   r2   )rH   r=   r>   r6   r1   r2   rD   fns     r)   _apply_fn_to_dataz-AutoQuantizableLinearWeight._apply_fn_to_data  s?    ~~Bt{OO#*]  
 
 	
r+   c                 H    dg| j         | j        | j        | j        | j        gfS Nr=   )r>   r1   r2   r6   r8   rJ   s    r)   __tensor_flatten__z.AutoQuantizableLinearWeight.__tensor_flatten__
  s0    z#IMJJ
 
 	
r+   c           	      N    |d         }|\  }}}}	}
 | ||||||
n||	|          S )Nr=   )r1   r2   r8   r6   strides )r'   tensor_data_dicttensor_attributes
outer_sizeouter_strider=   r>   r1   r2   r6   r8   s              r)   __tensor_unflatten__z0AutoQuantizableLinearWeight.__tensor_unflatten__  sV     "(+;L8D(E5s%-%%: 
 
 
 	
r+   c                      | ||fi |S r$   r   )r'   r=   r>   r@   s       r)   r[   z&AutoQuantizableLinearWeight.from_float#  s    s6-88888r+   r   c                    |i n|}|t           j        j        j        u rV|d         |d         t	          |          dk    r|d         nd }}}|                     |||            |||j        |          S 	 t           j                                        5   ||i |cd d d            S # 1 swxY w Y   d S # t          $ r t          d|            Y d S w xY w)Nr   r      z ERR: subclass doesn't implement )r:   nn
functionallinearlenrU   r=   _CDisableTorchFunctionSubclassr_   r\   )r'   functypesr?   r@   mat1rP   rQ   s           r)   __torch_function__z.AutoQuantizableLinearWeight.__torch_function__'  sK   ~658&---QQt99q==Qd  $+D
 MM$T2224k0$777	=6688 - -tT,V,,- - - - - - - - - - - - - - - - - - 	= 	= 	=;T;;<<<<<<	=s6   6B6 B)B6 )B--B6 0B-1B6 6CCc                     |t           j        j        u r5t          ||||d                             t
          j                            S d S Nr   )atendetachdefaultr   r   r:   )r'   r   r   r?   r@   s        r)   __torch_dispatch__z.AutoQuantizableLinearWeight.__torch_dispatch__9  sI    4;&&&.dFDG$=$=el$K$K   '&r+   )NN)r   N)rI   
__module____qualname____doc__staticmethodrA   rE   rK   rU   ri   r:   rW   r   r   r   classmethodr   r[   r   r   r   r+   r)   r   r   H   sq          d^I I I I \I2 d^! ! ! ! !
 
 
 = = \= #< #< #<J U]__H H _HT
 
 

 
 
 PT
 
 
 [
 9 9 [9 = = = [="   [  r+   r   c                 P  	 ddl m} |                    dd          }|                    dd          }t          j                    5  t          j                                         t          j                                        }|                    t          j        	                                           t          j        
                    |          5   | |i | ddd           n# 1 swxY w Y   |                                 t          j        	                                                    |           t          j                                         t          j                                        	t          j                            	|          5   | |i | ddd           n# 1 swxY w Y   t          d	          r.dd
lm} |                    	fd||d          } ||          }n|                    	fd||d          }ddd           n# 1 swxY w Y   |S )zL
    runs benchmark op(*args, **kwargs) avoiding torch.compile overhead
    r   )benchmarkerrepd   warmup   N)streamz	2.9.0.dev)medianc                  ,                                      S r$   replaygraphs   r)   <lambda>z$do_autoquant_bench.<locals>.<lambda>Z       r+   all)r   r   return_modec                  ,                                      S r$   r   r   s   r)   r   z$do_autoquant_bench.<locals>.<lambda>_  r   r+   r   )$torch._inductor.runtime.benchmarkingr   r9   r:   rW   cudasynchronizeStreamwait_streamcurrent_streamr   	CUDAGraphr   r   
statisticsr   benchmark_gpu)
opr?   r@   r   r   r   r   r   r-   r   s
            @r)   do_autoquant_benchr   A  s   
 A@@@@@
**UC
 
 CZZ"%%F	  
   ""$$5:4466777Zv&& 	  	 B	  	  	  	  	  	  	  	  	  	  	  	  	  	  	 
!!##//777
   
$$&&ZeF33 	  	 B	  	  	  	  	  	  	  	  	  	  	  	  	  	  	 !+.. 
	))))))++&&&&v3E ,  C &++CC++&&&&v3H ,  C)              . Js]   BH	C)H)C-	-H0C-	1B%H	F+H+F/	/H2F/	3AHH"Hc                     t          | t                    r<| d         dk    r0t          |           dk    rt          | d         t                    rdS dS )Nr   interpolater   r   TF)
isinstancelistr   floatr1   s    r)   _is_interpolate_moder   d  sT    4G}$$IINNtAw&&  t5r+   xreturnc                 @    |                      t          j                  S r$   )tor:   float16r   s    r)   _to_float16r   o  s    44r+   c                 @    |                      t          j                  S r$   )r   r:   bfloat16r   s    r)   _to_bfloat16r   s  s    44r+   c                     | S r$   r   r   s    r)   	_identityr   w  s    Hr+   c                   0    e Zd ZdZeddgfd            ZdS )AQMixina  
    Tests and benchmarks the autoquantization process for the given activation matrix, weight, and bias.

    Args:
        act_mat (torch.Tensor): The activation matrix.
        weight (torch.Tensor): The weight tensor.
        bias (torch.Tensor or None): The bias tensor.
        best_time (float): The best time to beat for the quantization process.
        mode (list, optional): A list containing mode settings for quantization. The first element is the mode type
                                (e.g., "relu"), and the second element is the mode value (e.g., None). Defaults to ["relu", None].

    Returns:
        float: The benchmarked time for the autoquantization process.
    r0   Nc           	      p                          |          }t          |          rt          j         j        d          }n fd}t          j        |d          }t          ||||dd          }	|	|dz  k     r t          ||||dd          }
|
d	z  |	d
z  z   }	t          d|	dd  d|dd           |	S )Nmax-autotune-no-cudagraphsr   c                 z    t          j                            t          j        |           ||                    S r$   )Fr0   rZ   )abcr'   s      r)   r   z)AQMixin._autoquant_test.<locals>.<lambda>  s,    16#*B*B16!99aQR*S*S#T#T r+   r   r   )r   r   g?i  g?g?>>time: 0.3frr   z, to_beat: zms )r[   r   r:   compilerZ   r   r\   )r'   rO   r=   rQ   r`   r1   	w_qtensorq_c_opr   r-   res2s   `          r)   r^   zAQMixin._autoquant_test  s    NN6**	%% 	L](/K  FF UTTTD]4.JKKKF )T"RUVVVS  %D  D *sSy(CMMMM#MM)MMMMNNN
r+   )rI   r   r   r   r   r^   r   r+   r)   r   r   {  sE          EKTN    [  r+   r   c                   t     e Zd ZU dZ e            Zeed<   e fd            Z	eddgf fd	            Z
 xZS )&AQInt8DynamicallyQuantizedLinearWeightzI
    AutoQuantizable version of Int8DynamicallyQuantizedLinearWeight
    	aq_layoutc           	         |                                 dk    r|S ddlm} ddlm} |}t
          j        }d }t          j        }t          j	        t          j
                  j        }t          j        }	| j        }
 ||          } |||||||	|
          }t          t          |                               ||          }|S )Nr   r   to_affine_quantized_intx)(_int8_symm_per_token_reduced_range_quantc                      d| j         d         fS Nr   r8   r   s    r)   get_weight_block_sizezPAQInt8DynamicallyQuantizedLinearWeight.from_float.<locals>.get_weight_block_size      qwqz?"r+   )epszero_point_dtype_layout)dimtorchao.dtypesr   torchao.quantization.quant_apir   r   	SYMMETRICr:   int8finfofloat32r   int64r   superr   r[   )r'   r=   r   r   input_quant_funcmapping_typer   target_dtyper   r   r   
block_sizerH   s               r)   r[   z1AQInt8DynamicallyQuantizedLinearWeight.from_float  s   ::<<1M 	<;;;;;	
 	
 	
 	
 	
 	

 D #,	# 	# 	# zk%-((, ;-**622
))-
 
 
 =sCCNN$
 
 r+   r0   Nc           	      `   t          |          s%t                                          |||||          S |d         }|                     |          }t	          |                    d|j        d                             \  }}	d }
t          j        |
d          }t          j	                    5  |j
        j        j                                                                        }t          |||	                    dd          |          }ddd           n# 1 swxY w Y   t!          d|dd	|  d
|dd           ||k    r|S ||d|z
  z  ||z
  z  z   }t                                          ||||          }||z
  ||z
  z  }||z  d|z
  |z  z   }t!          d|dd	|  d|d           |S )a  
        Tests and benchmarks the autoquantization process with special handling for interpolate mode.

        Args:
            act_mat (torch.Tensor): The activation matrix.
            weight (torch.Tensor): The weight tensor.
            bias (torch.Tensor or None): The bias tensor.
            best_time (float): The best time to beat for the quantization process.
            mode (list, optional): A list containing mode settings for quantization. The first element is the mode type
                                   (e.g., "relu"), and the second element is the mode value (e.g., None). Defaults to ["relu", None].

        Returns:
            float: The benchmarked time for the autoquantization process.
        r   rM   c                 (    t          | |          |z  S r$   r   )x_vals_int8x_scalesw_vals_int8s      r)   r   zHAQInt8DynamicallyQuantizedLinearWeight._autoquant_test.<locals>.<lambda>  s    {[8 8 8 r+   r   r   Nr   r   rr   z matmul, to_beat: rs   z# interpolated, breakeven constant: z0.2f)r   r   r^   r[   r   rN   r8   r:   r   rW   original_weight_tensortensor_implint_data
contiguoustr   r\   )r'   rO   r=   rQ   r`   r1   INTERPOLATION_CONSTANTr   r  r  quantized_matmul
q_c_matmulr  
res_matmulto_beatr-   max_int_const_winres_frH   s                     r)   r^   z6AQInt8DynamicallyQuantizedLinearWeight._autoquant_test  sU     $D)) 	S77**7FD)TRRR "&aNN6**	 EOOBb 122!
 !
X  	 ]#3:VWWW
]__ 	 	0<EPPRRTTVV  ,K)9)9"a)@)@+ J		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	XzXXXsXXiXXXX	
 	
 	

 "" 4<R8RS
"
 
 gg%%gvtWEE&3j8HI&,4J0Jj/XXjujjj#jjRcjjj	
 	
 	
 s   0ADDD)rI   r   r   r   r   r   r
   __annotations__r   r[   r^   __classcell__rH   s   @r)   r   r     s           $Iv%%%+ + + + [+Z EKTN 8 8 8 8 8 [8 8 8 8 8r+   r   c                   V     e Zd ZU  e            Zeed<   eddgf fd	            Z xZ	S )0AQInt8DynamicallyQuantizedSemiSparseLinearWeightr   r0   Nc                 N    t                                          ||||d           S r$   )r   r^   )r'   rO   r=   rQ   r`   r1   rH   s         r)   r^   z@AQInt8DynamicallyQuantizedSemiSparseLinearWeight._autoquant_test  s#    ww&&wiNNNr+   )
rI   r   r   r   r   r
   r  r   r^   r  r  s   @r)   r  r    su          )(**Iv***EKTN O O O O O [O O O O Or+   r  c                   2     e Zd ZdZe fd            Z xZS )%AQInt8WeightOnlyQuantizedLinearWeightzH
    AutoQuantizable version of Int8WeightOnlyQuantizedLinearWeight
    c                    t           j        }t          j        }t          j        t          j                  j        }t          j        }d|j        d         f}t          t          |                               ||||||          S )Nr   )r   r   )r   r   r:   r   r   r   r   r   r8   r   r  from_hp_to_intx)r'   r=   r  r  r   r   r  rH   s          r)   r[   z0AQInt8WeightOnlyQuantizedLinearWeight.from_float  sy    ",zk%-((, ;a)
:C@@PP- Q 
 
 	
r+   rI   r   r   r   r   r[   r  r  s   @r)   r  r    sN          
 
 
 
 [
 
 
 
 
r+   r  c                   H     e Zd ZdZed             Ze fd            Z xZS )&AQInt8WeightOnlyQuantizedLinearWeight2i
    AutoQuantizable version of Int8WeightOnlyQuantizedLinearWeight that
    uses a different kernel
    c                    | j         }| j        }|                     d| j        d         d          } | |j        j                                                            d          z                      d          } |j        g |dd         |j        d         R  |j        j        z  }|||z  }|	                    |          S )aT  
        Performs the quantized linear operations

        Args:
            act_mat (torch.Tensor): The activation matrix.
            w_qtensor (torch.Tensor): The quantized weight tensor.
            bias (torch.Tensor or None): The bias tensor.

        Returns:
            torch.Tensor: The result of the quantized operation.
        rM   r   r   )r   N)
r6   r8   rN   r
  r  r  	unsqueezesumscaler   )rO   r   rQ   
orig_dtype
orig_shapeys         r)   rZ   z;AQInt8WeightOnlyQuantizedLinearWeight2._quantized_linear_op8  s     ]
]
//"gmB&7;;y,57799CCAFFFKKPRKSSAI4z#2#4444y7L7RRIAttJr+   c                     |                     d|j        d                   j        d         dk    rt          j        S  t	                      j        |g|R  S )NrM   r       )rN   r8   r:   r]   r   r^   )r'   rO   r?   rH   s      r)   r^   z6AQInt8WeightOnlyQuantizedLinearWeight2._autoquant_testN  sT     ??2w}R0117:R??9&uww&w66666r+   )	rI   r   r   r   r   rZ   r   r^   r  r  s   @r)   r!  r!  0  si         
     \ * 7 7 7 7 [7 7 7 7 7r+   r!  c                   (    e Zd ZdZed             ZdS )&AQInt8WeightOnlyQuantizedLinearWeight3r"  c                    | j         }t          j        |                     d|d                   |j        j                                        |j        j        z            } |j        g |d d         |j         d         R  }|||z  }|S )NrM   )r8   r:   mmrN   r
  r  r  r'  )rO   r   rQ   r)  r*  s        r)   rZ   z;AQInt8WeightOnlyQuantizedLinearWeight3._quantized_linear_op^  s    ]
HOOB
2//!*,,..1F1LL
 
 AI4z#2#4444IAr+   N)rI   r   r   r   r   rZ   r   r+   r)   r.  r.  V  s9         
 	 	 \	 	 	r+   r.  c                   d     e Zd ZU dZdZeed<    ed          Ze	ed<   e
 fd            Z xZS )(AQInt4G32WeightOnlyQuantizedLinearWeightz5
    AutoQuantizable version of int4_weight_only
    r,  
group_size   )inner_k_tilesr   c                    ddl m} | j        }| j        }|j        d         |z  dk    r|S d }t          |t                    r<|j        t          j	        k    r'|
                    t          j	                  }t          }nXt          |t                    r<|j        t          j        k    r'|
                    t          j                  }t          }nt          }d}t           j        }d|f}t          j        }	d}
d}d}d}t          j	        }t&          j        }t          |t                    rt           j        }d}t&          j        }d} |||||	|
|||||||	          }t/          t0          |                               ||          S )
Nr   r   rM   Tr      gư>F)r   preserve_zerozero_point_domainr   use_hqq)r   r   r3  r   r8   r   r	   r6   r:   r   r   r   r   r   r   r   r   
ASYMMETRICint32r   FLOATr   INTr   r2  r[   )r'   r=   r   r3  r   r   r:  r  r  r  	quant_min	quant_maxr   r8  r   r9  rH   s                   r)   r[   z3AQInt4G32WeightOnlyQuantizedLinearWeight.from_floatv  s   ;;;;;;^
-<j(A--M w 566
	)..YYu~..F+!344 	)9V9VYYu}--F*("-_
{		 >+1g122 	&0L M / 3G))-'/
 
 
 =sCCNN$
 
 	
r+   )rI   r   r   r   r3  intr  r	   r   r
   r   r[   r  r  s   @r)   r2  r2  k  s~           J--A>>>Iv>>><
 <
 <
 <
 [<
 <
 <
 <
 <
r+   r2  c                       e Zd ZU dZeed<   dS )(AQInt4G64WeightOnlyQuantizedLinearWeight@   r3  NrI   r   r   r3  rA  r  r   r+   r)   rC  rC    $          Jr+   rC  c                       e Zd ZU dZeed<   dS ))AQInt4G128WeightOnlyQuantizedLinearWeight   r3  NrE  r   r+   r)   rH  rH    $          Jr+   rH  c                       e Zd ZU dZeed<   dS ))AQInt4G256WeightOnlyQuantizedLinearWeight   r3  NrE  r   r+   r)   rL  rL    rJ  r+   rL  c                   <    e Zd ZU dZeed<    e            Zeed<   dS )5AQInt4G128WeightOnlyQuantizedMarlinSparseLinearWeightrI  r3  r   N)	rI   r   r   r3  rA  r  r   r   r
   r   r+   r)   rO  rO    s=          J**,,Iv,,,,,r+   rO  c                   >     e Zd ZU dZeed<   e fd            Z xZS )/AQGemliteInt4G32WeightOnlyQuantizedLinearWeightr,  r3  c                 2   ddl m} ddlm} |j        t
          j        k    r|                    t
          j                  }d}d }d}d} ||| j        ||||          } ||fi |}t          }	t          t          |                               ||	          S )Nr   r   )get_gemlite_aqt_kwargs   weight_onlyT)r3  	bit_widthpacking_bitwidthr1   r:  )r   r   -torchao.prototype.dtypes.uintx.gemlite_layoutrS  r6   r:   r   r   r3  r   r   rQ  r[   )r'   r=   r   rS  rV  rW  r1   r:  
aqt_kwargsr   rH   s             r)   r[   z:AQGemliteInt4G32WeightOnlyQuantizedLinearWeight.from_float  s    ;;;;;;XXXXXX<5=((YYu}--F	++~-
 
 

 *)&??J??&DcJJUU$
 
 	
r+   )	rI   r   r   r3  rA  r  r   r[   r  r  s   @r)   rQ  rQ    sV          J
 
 
 
 [
 
 
 
 
r+   rQ  c                       e Zd ZU dZeed<   dS )/AQGemliteInt4G64WeightOnlyQuantizedLinearWeightrD  r3  NrE  r   r+   r)   r[  r[    rF  r+   r[  c                       e Zd ZU dZeed<   dS )0AQGemliteInt4G128WeightOnlyQuantizedLinearWeightrI  r3  NrE  r   r+   r)   r]  r]    rJ  r+   r]  c                       e Zd ZU dZeed<   dS )0AQGemliteInt4G256WeightOnlyQuantizedLinearWeightrM  r3  NrE  r   r+   r)   r_  r_    rJ  r+   r_  c                   N     e Zd ZdZ fdZed             Zed             Z xZ	S )rY   a  
    A class to be used in concert with AutoQuantizableLinearWeight to provide a
    default/non-quantized option. Only implements the bare minimum needed to work with the
    AutoQuantizableLinearWeight class using the same interfaces that would normally be
    used by QTensor subclasses but for a default linear op instead. Result of from_float
    is not a tensor subclass, but rather the float tensor.
    c                 H    t                                                       d S r$   )r   rE   )rD   rH   s    r)   rE   zAQDefaultLinearWeight.__init__  s    r+   c                 N    t           j        j                            | ||          S r$   )r:   r   r   r   rO   r   rQ   s      r)   rZ   z*AQDefaultLinearWeight._quantized_linear_op  s    x"))'9dCCCr+   c                     |S r$   r   r'   r=   s     r)   r[   z AQDefaultLinearWeight.from_float  s    r+   )
rI   r   r   r   rE   r   rZ   r   r[   r  r  s   @r)   rY   rY     s{              D D \D   [    r+   rY   c                   d    e Zd ZdZed	d            Zd	dZed             Zd Ze	d             Z
dS )
Float32Tensorz%Tensor subclass tensor for fp32 dtypeFc                     i }|j         |d<   |                    dd          r|                    d          n|j        |d<   |j        |d<   d|d<   |j        }t          j        j        | |fi |S )Nr4   r5   Fr6   r7   )r4   r&   r5   r6   r8   r:   r;   r<   )r'   r=   skip_weight_conversionr@   r8   s        r)   rA   zFloat32Tensor.__new__  s    !=x$*JJx$?$?RFJJx   V] 	x !,w"'|23HHHHHr+   c                 V    |r|n|                     t          j                  | _        d S r$   )r   r:   r   r=   rD   r=   ri  s      r)   rE   zFloat32Tensor.__init__(  $     6TffFIIem<T<Tr+   c                     t           j        }| j        }t           j        j                            |                     |          |j        ||                    |          n|                              |          S N)r6   )r:   r   r6   r   r   r   r   r=   rO   r   rQ   _DTYPEr(  s        r)   rZ   z"Float32Tensor._quantized_linear_op+  i    ]
x"))JJv#/DGGFOOOT
 
 ":"

		r+   c                 H    |                       || j                            S r$   )rH   r=   r   s     r)   r   zFloat32Tensor._apply_fn_to_data5  s%    ~~Bt{OO
 
 	
r+   c                      | |          S r$   r   re  s     r)   r[   zFloat32Tensor.from_float:  s    s6{{r+   NF)rI   r   r   r   r   rA   rE   rZ   r   r   r[   r   r+   r)   rg  rg    s        //	I 	I 	I \	IU U U U   \
 
 

   [  r+   rg  c                     |d         |d         t          |          dk    r|d         nd }}}|                    |||          S )Nr   r   r   )r   rZ   )r   r   r?   r@   input_tensorweight_tensorrQ   s          r)   _rx  ?  sN     	QQt99q==Qd "&-L
 --lM4PPPr+   c                 l    t          | |||d                             t          j                            S r   )r   r   r:   r   r   r   r?   r@   s       r)   rx  rx  J  s1    &dFDG55elCC  r+   c                 l    t          | |||d                             t          j                            S r   )r   r   r:   clonerz  s       r)   rx  rx  Q  s1    &dFDG55ekBB  r+   c           
          t          | || |d         j        |dd          i |                    t          j                            S )Nr   r   )r   r   r   r:   r|  rz  s       r)   rx  rx  X  sM    &Q
DH'''99%+FF	  r+   c                   D    e Zd ZddZed             Zedd            ZdS )BFloat16TensorFc                 V    |r|n|                     t          j                  | _        d S r$   )r   r:   r   r=   rk  s      r)   rE   zBFloat16Tensor.__init__c  s$     6UffFIIen<U<Ur+   c                     t           j        }| j        }t           j        j                            |                     |          |j        ||                    |          n|                              |          S rn  )r:   r   r6   r   r   r   r   r=   ro  s        r)   rZ   z#BFloat16Tensor._quantized_linear_opf  si    ]
x"))JJv#/DGGFOOOT
 
 ":"

		r+   c                      | ||          S r$   r   r'   r=   ri  s      r)   r[   zBFloat16Tensor.from_floatp      s61222r+   Nrt  rI   r   r   rE   r   rZ   r   r[   r   r+   r)   r  r  b  sg        V V V V   \ 3 3 3 [3 3 3r+   r  c                   D    e Zd ZddZed             Zedd            ZdS )Float16TensorFc                 V    |r|n|                     t          j                  | _        d S r$   )r   r:   r   r=   rk  s      r)   rE   zFloat16Tensor.__init__v  rl  r+   c                     t           j        }| j        }t           j        j                            |                     |          |j        ||                    |          n|                              |          S rn  )r:   r   r6   r   r   r   r   r=   ro  s        r)   rZ   z"Float16Tensor._quantized_linear_opy  rq  r+   c                      | ||          S r$   r   r  s      r)   r[   zFloat16Tensor.from_float  r  r+   Nrt  r  r   r+   r)   r  r  u  sg        U U U U   \ 3 3 3 [3 3 3r+   r  c                   2     e Zd ZdZe fd            Z xZS )AQFloat32LinearWeightz
    AutoQuantizable version for float32 precision weight

    (also converts input activation and bias to float32, and restores the original precision after
    linear)
    c                 T    t          t          |                               |          S r$   )r   r  r[   r'   r=   rH   s     r)   r[   z AQFloat32LinearWeight.from_float  "    *C00;;FCCCr+   r  r  s   @r)   r  r    W          D D D D [D D D D Dr+   r  c                   2     e Zd ZdZe fd            Z xZS )AQBFloat16LinearWeightz
    AutoQuantizable version for bfloat16 precision weight

    (also converts input activation and bias to bfloat16, and restores the original precision after
    linear)
    c                 T    t          t          |                               |          S r$   )r   r  r[   r  s     r)   r[   z!AQBFloat16LinearWeight.from_float  s"    +S11<<VDDDr+   r  r  s   @r)   r  r    sW          E E E E [E E E E Er+   r  c                   2     e Zd ZdZe fd            Z xZS )AQFloat16LinearWeightz
    AutoQuantizable version for float16 precision weight

    (also converts input activation and bias to float16, and restores the original precision after
    linear)
    c                 T    t          t          |                               |          S r$   )r   r  r[   r  s     r)   r[   z AQFloat16LinearWeight.from_float  r  r+   r  r  s   @r)   r  r    r  r+   r  c                   l     e Zd ZU dZej        Zej        ed<   e	d             Z
e fd            Z xZS )'AQFloat8WeightOnlyQuantizedLinearWeightzo
    AutoQuantizable version of Float8WeightOnlyQuantizedLinearWeight for target_dtype=torch.float8_e4m3fn
    r  c                 r    t           j        j                            | |                                |          S r$   )r:   r   r   r   
dequantizerc  s      r)   rZ   z<AQFloat8WeightOnlyQuantizedLinearWeight._quantized_linear_op  s+    x"))'93G3G3I3I4PPPr+   c                     d|j         d         f}t          t          |                               ||| j        t                                S )Nr   )r  r   )r8   r   r  from_hp_to_floatxr  r   )r'   r=   r  rH   s      r)   r[   z2AQFloat8WeightOnlyQuantizedLinearWeight.from_float  sL    a)
<cBBTTJS-=|~~ U 
 
 	
r+   )rI   r   r   r   r:   float8_e4m3fnr  r6   r  r   rZ   r   r[   r  r  s   @r)   r  r    s           !& 3L%+333Q Q \Q 
 
 
 
 [
 
 
 
 
r+   r  c                   F     e Zd ZdZ e            Ze fd            Z xZS )5AQFloat8PerRowScalingDynamicallyQuantizedLinearWeightza
    AutoQuantizable version of Float8DynamicallyQuantizedLinearWeight using per row scaling
    c                 H   ddl m} ddlm} d }t          j        }t          j        }t          t          d                    }|}| j        |d}	 ||          }
 |||
||t          j	        	          }t          t          |                               |||	          }|S )
Nr   to_affine_quantized_floatx _input_activation_quant_func_fp8c                      d| j         d         fS r   r   r   s    r)   r   z_AQFloat8PerRowScalingDynamicallyQuantizedLinearWeight.from_float.<locals>.get_weight_block_size  r   r+   Tuse_fast_accum	mm_configactivation_granularityactivation_dtypeinput_floatr  r  r   scale_dtype)r   r  r   r  r:   r  r   r   r  r   r   r  r[   r'   r=   r  r  r   r  input_target_dtyper   r   input_quant_argsr  rH   s              r)   r[   z@AQFloat8PerRowScalingDynamicallyQuantizedLinearWeight.from_float  s     	>=====SSSSSS	# 	# 	# *"0t)L)L)LMMM;&)&@ 2
 
 +*622
++!%
 
 
 A3
 

*V-/?
@
@ 	 r+   )	rI   r   r   r   r   r  r   r[   r  r  s   @r)   r  r    sZ          $VXX    [    r+   r  c                   F     e Zd ZdZ e            Ze fd            Z xZS )8AQFloat8PerTensorScalingDynamicallyQuantizedLinearWeightzd
    AutoQuantizable version of Float8DynamicallyQuantizedLinearWeight using per tensor scaling
    c                 H   ddl m} ddlm} d }t          j        }t          j        }t          t          d                    }|}| j        |d}	 ||          }
 |||
||t          j	        	          }t          t          |                               |||	          }|S )
Nr   r  r  c                 :    | j         dk    s
J d            | j        S )Nr   zOnly works for 2D tensors)ndimr8   r   s    r)   r   zbAQFloat8PerTensorScalingDynamicallyQuantizedLinearWeight.from_float.<locals>.get_weight_block_size  s    6Q;;; ;;;;7Nr+   Tr  r  r  r  )r   r  r   r  r:   r  r   r   r  r   r   r  r[   r  s              r)   r[   zCAQFloat8PerTensorScalingDynamicallyQuantizedLinearWeight.from_float  s     	>=====SSSSSS	 	 	 *"0t)L)L)LMMM;&)&@ 2
 
 +*622
++!%
 
 
 Dc
 

*V-/?
@
@ 	 r+   )	rI   r   r   r   r   r  r   r[   r  r  s   @r)   r  r    sZ          'Y[[    [    r+   r  c                 X   ddl m} |                    d|          }|                    dd          }|                    dt                    |d<   |                    ddd	g          |d<   |                    d
d	          |d
<   ddl m}m}  ||  |t          fi |||n|           d	S )z
    Converts all linear weight tensors to the
    AutoQuantizableLinearWeight tensor subclass. Expectation is that this is followed
    by running the model and then calling _change_autoquantizable_to_quantized
    r   )
_is_linear	filter_fnry   Tr>   r1   r0   Nr2   _get_subclass_inserter)_replace_with_custom_fn_if_matches_filter)r   r  r9   r&   r   r  r  r   )modelr@   r  r  rx  r  r  s          r)   "_change_linears_to_autoquantizabler  ]  s    :99999

;
33I

4	 	A $*:::$ $F  ZZ77F6NJ55F:       
 .-:EEfEE*		
    r+   Tc           	         t           j        j        j        }dt           j        j        _        |r[t           j        j        j        }dt           j        j        _        ddl}t           j                            |j        |j                   |	                    dd           }|	                    dd          }dd	l
m}m}	  |	|  |t          fd
|d||           |t           j        j        _        |r4|t           j        j        _        t           j                                         t           j                                         dS )z
    Converts AutoQuantizableLinearWeight tensor subclasses
    to various quantized/non-quantized tensor subclasses depending
    on benchmark results. Expectation is that these modules are
    torch.compiled afterwards.
    FTr   N)inductordynamor  c                 V    t          | d          ot          | j        t                    S r   )hasattrr   r=   r   )modr?   s     r)   r   z6_change_autoquantizable_to_quantized.<locals>.<lambda>  s)    7311 @sz#>?? r+   ry   r  r   )methodry   )r:   rw   configautomatic_dynamic_shapessuppress_errorslogging_loggingset_logsCRITICALr9   r   r  r  r   rx   )
r  supress_autoquant_errorsr@   hold_automatic_dynamic_shapeshold_supress_errorsr  r  ry   r  r  s
             r)   $_change_autoquantizable_to_quantizedr  z  sn    %*M$8$Q!49EM1 T#m2B/3,)9'BRSSS

	@ 	@ I
 jj!2D99O       
 .-'	
!+	
 	
 		
 	
 		 	 	 5REM1   "/B,!!!	Mr+   r   g333333?Fc	                    	 t           j                            d           |r#t          j        j                                         |t          u rJt           j        	                                r"t           j        
                                dk    s
J d            t           f||||d	 t           t           j        j        j                  }
|
r j        n r|
r j        _        j         _        nfd}                     |d          	 fd}|_        t          |t           j                  r|g}t          |t,          t.          f          r  |   S )	a  
    Autoquantization is a process which identifies the fastest way to quantize each layer of a model over some set of potential
    qtensor subclasses.

    Autoquantization happens in three steps:

    1-Prepare Model: the model is searched for Linear layers whose weights are exchanged for AutoQuantizableLinearWeight.
    2-Shape Calibration: the user runs the model on one or more inputs, the details of the activation shape/dtype seen by
        the AutoQuantizableLinearWeight are recorded so we know what shapes/dtypes to use in order to optimize the quantized op in step 3
    3-Finalize Autoquantization: for each AutoQuantizableLinearWeight, benchmarks are run for each shape/dtype on each member of the qtensor_class_list.
        the fastest option is picked, resulting in a highly performant model

    This autoquant function performs step 1. Steps 2 and 3 can be completed by simply running the model.
    If `example_input` is provided, this function also runs the model (which completes steps 2 and 3).
    This autoquant api can handle models which have already had torch.compile applied to them, in which case, once the model is run and quantized,
    the torch.compile process normally proceeds as well.

    To optimize over a combination of input shapes/dtypes, the user can set manual=True, run the model with all desired shapes/dtypes, then
    call model.finalize_autoquant to finalize the quantization once the desired set of inputs have been logged.

    Args:
        model (torch.nn.Module): The model to be autoquantized.
        example_input (Any, optional): An example input for the model. If provided, the function performs a forward pass
                                       on this input (which fully autoquantizes the model unless manual=True). Defaults to None.
        qtensor_class_list (list, optional): A list of tensor classes to be used for quantization. Defaults to DEFAULT_AUTOQUANT_CLASS_LIST.
        filter_fn (callable, optional): A filter function to apply to the model parameters. Defaults to None.
        mode (list, optional): A list containing mode settings for quantization. The first element is the mode type (e.g., "interpolate"),
                               and the second element is the mode value (e.g., 0.85). Defaults to ["interpolate", .85].
        manual (bool, optional): Whether to stop shape calibration and do autoquant after a single run (default, False) or to wait for
                                the user to call model.finalize_autoquant (True) so inputs with several shapes/dtypes can be logged.
        set_inductor_config (bool, optional): Whether to automatically use recommended inductor config settings (defaults to True)
        supress_autoquant_errors (bool, optional): Whether to suppress errors during autoquantization. (defaults to True)
        min_sqnr (float, optional): minimum acceptable signal to quantization noise ration (https://en.wikipedia.org/wiki/Signal-to-quantization-noise_ratio) for output of quantized layer v.s. non-quantized layer, this is used to filter
        out quantization methods that causes too large numerical impact, user can start with a resaonable
        number like 40 and adjust depending on the result
        **aq_kwargs: Additional keyword arguments for the autoquantization process.

    Returns:
        torch.nn.Module: The autoquantized and wrapped model. If `example_input` is provided, the function performs a forward pass
                         on the input and returns the result of the forward pass.

    Example usage:
        torchao.autoquant(torch.compile(model))
        model(*example_input)

        # multiple input shapes
        torchao.autoquant(model, manual=True)
        model(*example_input1)
        model(*example_input2)
        model.finalize_autoquant()
    ztorchao.quantization.autoquant)r4  	   z float8 requires CUDA arch >= 8.9)r  r>   r1   r2   c                 N     j         |i | |                                  ||fS r$   )forwardfinalize_autoquant)moduler?   r@   
real_models      r)   autoquant_prehookz$autoquant.<locals>.autoquant_prehook  s7    J////%%'''<r+   T)with_kwargsc                      t          fi   t          d          rj        _        t	          d           t          d          rt	          d           s                                 d S d S )Nold_forwardr  )r  r  r  r  delattrremove)	aq_kwargshandlemanualr  r  r  s   r)   r  z%autoquant.<locals>.finalize_autoquant$  s    ,$	
 	
 	
 	
 	

 :}-- 	/&2EMJ...:344 	6J 4555 	MMOOOOO	 	r+   )r:   r   _log_api_usage_oncetorchaoquantizationutils"recommended_inductor_config_setterr!   r   is_availableget_device_capabilityr  r   rw   
eval_frameOptimizedModule	_orig_modr  r  register_forward_pre_hookr  r;   tupler   )r  example_inputr>   r  r1   r  set_inductor_configr  r2   r  is_compiledr  r  r  r  s   `    ` ` `   @@r)   r   r     s   @ 
H  !ABBB H"EEGGG777z&&(( 	.UZ-M-M-O-O T
 .
 .
 .
 ..
 .
 
 '-     UEM$<$LMMK _


 V  	/%*]J"&.EM	  	  	  	  	  001BPT0UU          %7J! -.. (&-%// }Lr+   ru   )br:   torch.nn.functionalr   r   r   torch.utils._python_dispatchr   r  r   r   r   r   r   r   r	   torchao.dtypes.utilsr
   torchao.float8.inferencer   torchao.kernelr   7torchao.quantization.linear_activation_quantized_tensorr   %torchao.quantization.quant_primitivesr   r   torchao.quantization.utilsr   r   torchao.utilsr   r   r   r   granularityr   r   __all__opsr   r%   r*   r.   r;   r   rW   r   r   r   r   r   r   r   r  r  r!  r.  r2  rC  rH  rL  rO  rQ  r[  r]  r_  rY   rg  implements_torch_functionr   
implementsr   rx  r   r|  _to_copyr  r  r  r  r  r  r  r  r   r   r   r   r!   r    r"   r   setr  r  r   serializationadd_safe_globalsr   r+   r)   <module>r     s
             D D D D D D                 ( ' ' ' ' ' 3 3 3 3 3 3 & & & & & &                                     

 
 
 y~ A A A6 6 6
v v v v v%, v v vr   D  5< EL     EL  U\         %,    ! ! ! ! ! ! ! !Hn n n n nW6U n n nbO O O O O*O O O
 
 
 
 
,A7 
 
 
,#7 #7 #7 #7 #7)7#7 #7 #7L    )7  *H
 H
 H
 H
 H
#WH
 H
 H
V    ,      ,      ,  - - - - -,- - -
 
 
 
 
#W
 
 
D    3      3      3      EL'   ,# # # # #% # # #L (()<)CDD$+-..Q Q /. EDQ $+-..  /. $*,--  .- $-/00  103 3 3 3 3] 3 3 3&3 3 3 3 3M 3 3 3&
D 
D 
D 
D 
DM7 
D 
D 
D
E 
E 
E 
E 
E^W 
E 
E 
E
D 
D 
D 
D 
DM7 
D 
D 
D
 
 
 
 
.CW 
 
 
&' ' ' ' ',' ' 'T( ( ( ( (,( ( (Z )* +   *,% ! *3% ! & " +9<	  94' # !'(()  3    /@! 
  X!V WW D CC  4$< = =>>   < %). . . .h  3
	!K K K K\   $ $%= > > >   $ $    r+   