
    fPiǍ                     "   d Z ddlZddlZddlZddlZddlZddlZddlmZ ddl	m
Z
 ddlZddlmZ ddlmZ  ej        d          Z	 dd	Zd dZd!dZd!dZd dZd Zi d
ddi ddgdfdZd Zd Z	 	 	 	 	 	 	 	 d"dZi d
ddddddddddgfdZdS )#zWeightOnly for onnxrt adaptor.    N)numpy_helper)np_dtype_to_tensor_dtype   )	ONNXModel)simple_progress_barneural_compressorc	           	      2   ||z  dz  }	t          j        |j        d         |	fd          }
| j        d         d|d|z   }| j        d         |g}g }i }d}|d	k    r:|d
d
d
d
df         |d
d
dd
df         d	z  z  }|d
d
d
|	f         |
d
d
d
d
f<   n'|dk    r|}
nt                              d| d           t          j        |
d||	f          }
t          j        |d|f          }|j        t           j        k    s|j        t           j	        k    sJ t          j                            | j        d         dz   t          |j                  |j        |                                d          }|                    |j                   |                    |           ||dk    r|                    d          }n|d	k    rt          j        |j        d         dz   dz  dd          }t          j        |j        d         |z  |z                                d          }|d
d
d         }|dd
d         }||dz           dz  ||                                         z  ||dz  <   ||dz           dz  ||                                         d	z  z  ||dz  <   nt+          d| d          t          j        ||d         df          }t          j                            | j        d         dz   d|j        |                                d          }|                    |j                   |                    |           |d         |d<   |d         |d<   ||d<   ||d<   |dk    r||d<   t          j                            |d|
j        |
                                d          }|                    |           t          j        j        |f|| j        | j        r| j        dz   t1          |          z   ndt1          |          z   dd|}||fS )aB  Build MatMulNBits node.

    Args:
        node: original matmul node
        weight_shape: original weight shape
        num_bits (int): num_bits
        group_size (int): how many elements share one scale/zp
        k_blocks (int): block number
        q_weight (array): quantized weight
        scale (array): scale
        zero_point (array): zero point
        accuracy_level (int): accuracy level. Support 0 (unset), 1(fp32), 2(fp16), 3(bf16), or 4(int8).

    Returns:
        matmul_weight_only_node: MatMulNBits node
        new_inits: initializers of the new node
       r   uint8dtyper   _QGMatMulNBits   N   z8MatMulNBits does not have kernel support for num_bits = ._scaleTname	data_typedimsvalsraw         _zpKNbits
block_sizeaccuracy_levelzcom.microsoft)inputsoutputsr   domain)npzerosshapeinputloggererrorreshaper   float32float16onnxhelpermake_tensorr   tobytesappendr   astypefullarangeravel
ValueError	make_nodeoutputstr)nodeweight_shapenum_bits
group_sizek_blocksq_weightscale
zero_pointr$   	blob_sizepackedq_weight_nameinput_names	new_initskwargsop_typeq_weight_pairsscale_tensor	packed_zpidxeven_idxodd_idx	zp_tensorq_weight_tensormatmul_weight_only_nodes                            /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/onnxruntime/quantization/neural_compressor/weight_only.pymake_matmul_weight_only_noderW   ,   s   8 X%*IXx~a()4GDDDFJqM$D$D$Dj$D$DDM:a=-0KIFG 1}}!!!!SSqS&)HQQQ1W,=,BB%aaa)m4qqq!!!t	Q[PX[[[\\\ZXy 9::F Jur8n--E;"*$$rz(A(A(AA;**Z]X%*5;77[]]__ +  L |()))\""" q=="))'22II]]!1!!4q!8Q >7SSSI)J,Q/8;hFGGOOPRSSC33Q3xH!$Q$iG(1(a-(@4(G:V^K_KeKeKgKg'gIh!m$'0A'>'E*U\J]JcJcJeJeijJj&kIgl##cX`cccdddJy<?B*?@@	K++A&!)/PYPaPaPcPcim , 
 
	 	9>***### q/F3Kq/F3KF6N%F<#1 k--\^^ .  O _%%%"k315TTYH--s8}}@T    #I--    r       asymint      ?c           	         t          j        | d|f          } |dk    s|dk    rd|z  dz
  dn*|dk    r$|dk    rd|dz
  z  dz
  nd|dk    r	d|dz
  z   ndt          j        | dd	          |z  }t          j        | dd	          |z  }|dk    rt          j        t          j        |          t          j        |                    }t          j        |j                  }	|dk    }
||
         d
z                      t           j	                  z
  z  |	|
<   |dk    rt          j
        |	j                  n#t          j        |j        d          d|dz
  z  z  }nt          j        |j                  }	t          j        fd||z
  ||k                                                                             D                       |	||k    <   |dk    r1t          j
        |	j                  |z
  |	z                                  nit          j        dt          j        t          j
        |	j                  |z
  |	z                                                                          d          }t          j        | |	j                  }t          j        | |	|           t          j        |||           t          j        ||           t          j        ||           ||	|fS )a	  Quantize tensor per group.

    Args:
        data : input weight
        num_bits (int, optional): num_bits. Defaults to 4.
        group_size (int, optional): how many elements share one scale/zp. Defaults to 4.
        scheme (str, optional): quantization scheme. Defaults to "asym".
        dtype (str, optional): data type. Defaults to "int".
        ratio (float, optional): percentile of clip. Defaults to 1.0.

    Returns:
        output: quantized weight
        scale: scale
        zero_point: zero point
    r   rZ   uintr   r   r   symTaxiskeepdimsg       @r[   r   r   c                 :    g | ]}t          |          z
  z  S  )float).0imaxqminqs     rV   
<listcomp>z quant_tensor.<locals>.<listcomp>   s(    ^^^!U1XX%^^^rX   out)r(   r.   minmaxmaximumabsonesr*   r6   float64r)   arrayflattentolistroundminimum
empty_liker   divideaddclip)datar@   rA   schemer   ratiorminrmax	max_rangerD   maskrE   rC   rh   ri   s                @@rV   quant_tensorr      s     :dR,--D5F??({Q	5*2a--qX\"Q&&Q)1Qx!|$%%B6$Q...6D6$Q...6DJrvd||RVD\\::	
##1} ,44RZ@@D4KPd%*e^^BHU[!!!SZ9[9[9[_`empqeq_r9s 	
 
## h^^^^^tTT\/J/R/R/T/T/[/[/]/]^^^
 
ddl
 ~~ hu{##d*e3::<<<Arz$"(5;2G2G$2NRW1W0^0^0`0`aabbiijqrr 	 }T555HIdEx((((F8ZX....HX8$$$$GHdDh////UJ&&rX   c                    t          j        | d|f                              t           j                  } d|z  dz
  }d}t          j        | dz  dd          }t          j        ||z            }t          j        |t          j        |                     }t          j        | dd          }t          j	        | dd          }	t          j        |dd          }
t          j        || z  dd          }t          j
        |	j        | j                  }||	k    }||z
  |	|         ||         z
  z  ||<   d|z  }t          j        t          j        || |z
  z            ||          }||z  |z   | z
  }t          j        ||dz  z  dd          }d}d	}d}t          |          D ]}t          j
        |	j        | j                  }t          j        |||z  z   |z   |z
  g                              | j                  d         }||	k    }||	|         ||         z
  z  ||<   t          j        t          j        || |z
  z            ||          }||z  }t          j        |dd          }t          j        ||z  dd          }t          j        || z  dd          }t          j        |
|z  |dz            }|
|z  ||z  z
  |z  }||z  ||z  z
  |z  }||z  |z   | z
  }t          j        ||dz  z  dd          } t          j        |           }!t          j        |          }"t          j        |!|"k               d         }#||#d
d
f         ||#d
d
f<   | |#         ||#<   ||#         ||#<   ||#         ||#<   t          j        | |z                                  d|                              d          }$|                    t           j                  }t          j        | |j                  }%t          j        | ||%           t          j        |%|$|%           t          j        |%|%           t          j        |%|||%           |%||$fS )a  Quantize tensor per group based on k quant.

    Ref: https://github.com/ggml-org/llama.cpp/blob/64eda5deb9859e87a020e56bab5d2f9ca956f1de/ggml/src/ggml-quants.c

    Args:
        data : input weight
        num_bits (int, optional): num_bits. Defaults to 4.
        group_size (int, optional): how many elements share one scale/zp. Defaults to 32.

    Returns:
        output: quantized weight
        scale: scale
        zero_point: zero point
    r   r   r   r   Tr`   r      皙?Nr   rk   )r(   r.   r6   r/   sumsqrtrz   rp   rm   rn   rq   r*   r   r{   rv   rangers   subtractwhererr   rx   ry   )&r|   r@   rA   rh   ri   sum_x2av_xweightsr   r   sum_wsum_xiscaler   rD   
quant_datadiffbest_madnsteprdeltarrminis_
iscale_newfactorquant_data_newmul_weights_quant_data_newsum_lsum_l2sum_xlD
this_scalethis_minmadmad_1
best_mad_1idx_to_replacerE   rC   s&                                         rV   quant_tensor_k_quant_cpur      sc    :dR,--44RZ@@Dh;?DDVD!G!d333F76J&''DfT26$<<((G6$Q...D6$Q...DF7T222EF7T>D999EWTZtz222F4<D4KDJd$;<F4LJE&D4K"8994FFJ:$t+Dvga'a$???HEFEU|| 8 8WTZtz:::
56C</$6=>??FFtzRRSTUt|!T$Z$t*%<=
4*t*D!E!EtTRR%,~%="1DIII2^C!VZ[[[2T9DQQQKq11fnuu}49
UNUV^3q8N*X5<fWtQw&Q>>>Xh''
%*"455a8(6~qqq7H(I
>111$%#&~#6  *> :n'7^D5E/0022At<<CCGLLJLL$$E}T555HIdEx((((F8ZX....HX8$$$$GHdDh////UJ&&rX   c                 
   	 ddl }ddl}|j                                        r|                    |           } |                     d|f                              |j                  } d|z  dz
  }d}|                    | dz  dd          }|	                    ||z            }|
                    ||                    |                     }	|                    | dd          }
|                    | dd          }|                    |	dd          }|                    |	| z  dd          }|                    |j        | j                  }|
|k    }||z
  ||         |
|         z
  z  ||<   d|z  }|                    |                    || |
z
  z            ||          }||z  |
z   | z
  }|                    |	|dz  z  dd          }d	}d
}d}t'          |          D ]}|                    |j        | j                  }|                    |||z  z   |z   |z
  g                              | j                  d         }|
|k    }|||         |
|         z
  z  ||<   |                    |                    || |
z
  z            ||          }|	|z  }|                    |dd          }|                    ||z  dd          }|                    || z  dd          }|                    ||z  |dz            }||z  ||z  z
  |z  } ||z  ||z  z
  |z  }!| |z  |!z   | z
  }|                    |	|dz  z  dd          }"|                    |"          }#|                    |          }$|                    |#|$k               d         }%||%ddf         ||%ddf<   |"|%         ||%<   | |%         ||%<   |!|%         |
|%<   |                    |
 |z                                  d|                              d          }&|                    |j                  }|                    | |j                  }'|                    | ||'           |
                    |'|&|'           |                    |'|'           |                    |'|||'           |'                                |                                |&                                fS t6                              d           t;          | ||          S # t<          $ r. t6                              d           t;          | ||          cY S w xY w)a  Quantize tensor per group based on k quant.

    Ref: https://github.com/ggml-org/llama.cpp/blob/64eda5deb9859e87a020e56bab5d2f9ca956f1de/ggml/src/ggml-quants.c

    Args:
        data : input weight
        num_bits (int, optional): num_bits. Defaults to 4.
        group_size (int, optional): how many elements share one scale/zp. Defaults to 4.

    Returns:
        output: quantized weight
        scale: scale
        zero_point: zero point
    r   Nr   r   r   Tr`   r   r   r   r   rk   zqTry to use k-quant quantization on CUDA. However, CUDA is not available.Fall back to k-quant quantization on CPU.zNow we are using k-quant quantization on cpu, which is time consuming.Please consider install cupy to speed up on CUDA. See https://cupy.dev/Please also install torch to check CUDA availability.) cupytorchcudais_availableasarrayr.   r6   r/   r   r   rz   rp   rm   rn   rq   r*   r   r{   rv   r   rs   r   r   rr   rx   ry   getr,   warningr   ImportErrorinfo)(r|   r@   rA   cpr   rh   ri   r   r   r   r   r   r   r   r   r   rD   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rE   rC   s(                                           rV   quant_tensor_k_quant_cudar     ss   ID:""$$ >	H::d##D<<Z 01188DDDh;?DDVVD!G!dV;;F776J.//DffT266$<<00G66$Q666D66$Q666DFF7TF::EFF7T>DFAAEWWTZtzW::F4<D 4KDJd,CDF4LJE&D4K*@!A!A4NNJ:%,t3Dvvga/a$vGGHEFEU|| @ @WWTZtzWBB
56C<#7$#>#E"FGGNNtzZZ[\]t|#)T$Z$t*-D#E
4 !#*t2L)M)MtUY!Z!Z-4~-E*9DQQ :^ KRS^bcc :T ATXYYKKq99#fnuu}<A
"UNUV^;q@!N2X=DffWtQw.QfFFXXh//
!#%**<!=!=a!@0>~qqq?P0Q
>111,-+.~+>((2>(Bn%'/'?^$$D5E/!8!8!:!:AtDDKKGTTJLL,,E}}T}==HIIdExI000FF8ZXF666HHX8H,,,GGHdDhG777<<>>599;;
0@0@@@NN<   ,D(JGGG D D DD	
 	
 	

 (h
CCCCCDs   ST *T 5T=<T=c                 v    | j         }t          | |||||          \  }}}	t          j        |||	z
  z  |          S )a  Quant dequant tensor per group.

    Args:
        data : input weight
        num_bits (int, optional): num_bits. Defaults to 4.
        group_size (int, optional): how many elements share one scale/zp. Defaults to 4.
        scheme (str, optional): quantization scheme. Defaults to "asym".
        dtype (str, optional): data type. Defaults to "int".
        ratio (float, optional): percentile of clip. Defaults to 1.0.

    Returns:
        output: quant-dequant weight
    )r*   r   r(   r.   )
r|   r@   rA   r}   r   r~   	org_shapeweightrD   zps
             rV   
qdq_tensorr   y  sE     
I$T8ZPUVVFE2:ev{+Y777rX   c                     |dk    r| S | j         }||z  }||d         z
  }|dk    rt          j        | d|fdfd          } | S )a  Pad tensor rowi so that it can be is divisible by group_size.

    Args:
        weight (array): weight
        group_size (int): how many elements share one scale/zp
        k_blocks (int): the number of block

    Returns:
        weight: paded weight
    r   r   )r   r   constant)r*   r(   pad)r   rA   rB   org_w_shapepadded_rowspad_lens         rV   
pad_tensorr     s]     R,KZ'KKN*G{{!Wv 6
CCMrX   CPUExecutionProviderk_quantc	                    t          |           } | j        $t          j                            | j                  nd}	g }
g }t          d |                                 D                       }d}|                                 D ]}|j        dv r|dz  }t          ||           |j        dv rj| 	                    |j
        d                   I|                    |j        i           dk    r)| 	                    |j
        d                   }t          j        ||	                                          }t          |j                  d	k    r|j        }|j        |v r9||j                 d
         }||j                 d         }||j                 d         }|j        }|dk    r|n|d         }|d         dz
  |z  dz   }|                     |j
        d                   }t'          |||          }|dk    p|dk    }|r|dk    rt)          |j        ||          \  }}}n<t-          |j        |||d|                    |j
        d         d                    \  }}}t/          ||||||                    d          |                    |          |dk    s|dk    r|nd|	  	        \  }}|                     |           |                    |           |
                    |           nt7          |j        |||d|                    |j
        d         d                    }t9          j        ||d         df          }t9          j        |          }|d|d         ddf                             |          }t>          j         !                    |j
        d         d|d|z   tE          |          |j        |#                                d          }| $                    |           |j        |j
        d<   |dk    r| %                    |           | &                    |
           | '                    |           | (                                 | S )a  Quant the model with round to nearst method.

    Args:
        model (ModelProto or ONNXModel): onnx model
        weight_config (dict): quantization config
                For example,
                weight_config = {
                    'fc2':
                        {
                            'bits': 4,
                            'group_size': 32,
                            'scheme': 'sym',
                            'algorithm': 'RTN'
                        }
                }
        num_bits (int, optional): num_bits. Default is 4.
        group_size (int, optional): how many elements share one scale/zp. Default is 32.
        scheme (str, optional): sym or asym. Defaults to "asym".
        ratios (dict, optional): percentile of clip. Defaults to {}.
        accuracy_level (int): accuracy level. Support 0 (unset),1(fp32), 2(fp16), 3(bf16), or 4(int8).
        providers (list): providers to use

    Returns:
        model: fake quantized ONNXModel
    N c                 $    g | ]}|j         d v |S )MatMul)rL   rf   rg   s     rV   rj   z rtn_quantize.<locals>.<listcomp>  s$    III1j1H1HQ1H1H1HrX   r   r   r   fp32)base_dirr   r"   rA   r}   r   r   r
   r   r^   r   rZ   	r>   r?   r@   rA   rB   rC   rD   rE   r$   r[   r   r   Tr   ))r   
model_pathospathdirnamelennodesrL   r   get_initializerr+   r   r   r   to_arraycopyr*   r   get_initializer_share_numr   r   Tr   rW   r6   add_initializersr5   r   r(   r.   	transposer1   r2   r3   r   r4   add_initializerremove_initializer	add_nodesremove_nodestopological_sort)modelweight_configr@   rA   r}   ratiosr$   	providers	algorithmr   	new_nodesr   	total_numcurr_idr>   weight_tensorr   r   r   rB   init_share_numsatisfy_MatMulNBits_conditionrC   rD   r   q_matmul_noderJ   rT   s                               rV   rtn_quantizer     sv   H eE494D4Prwu/000VXHILIIIIIJJIG E8 E8<:%%qLG	7333LJ&&%%djm44@!!$)R00F::!11$*Q-@@M!*=8LLLQQSSF6<  A%%LEyM))(3F;*495lC
&ty1(; ,K'1R'7'7[^J#A*z9A=H"<<TZ]KKN
H==F,4M,JX]), $5	))*CFHhXb*c*c'HeRR*6(J

SWS]^_S`bcHdHd+ +'HeR ,H!,%)%%__W55,,u--%+v%5%5i9O9OrrUY#1
, 
, 
,(y &&y111##D)))  ////%fh*feU[U_U_`d`jkl`mopUqUqrr:hQ0DEE<11#$4k!n$4aaa$78??FF"&+"9"9A)Ih)I)I:)I)II6u==!))++ #: # # %%o666 / 4
1""((777	OOI	|$$$	LrX   c           
         | j         }|dk    rt          j        | d|f          n| } t          j        t          j        t          j        |           t          j        t          j        |           dd          z  |          d          }|S )zGet the scale of weight.r   r   Tr`   r   ra   )r*   r(   r.   meanrp   rn   )r   rA   r   rD   s       rV   get_weight_scaler     s|    I5?25E5ERZZ 01116FGBJrvf~~rvf~~AX\0]0]0]]_hiipqrrrELrX   c                    ddl m} ddlm t	          j                    }t          j        dk     r. |d          r#ddlm	} |
                     |                       | j        r&t          j        | j        | j        dz   d	d	d
           | j        s.t	          j        | j                                        ||          nt	          j        | j        dz   ||          }d |                                D             }~g }	t'          |          D ]\  }
}|dk    r|
dz   |j        z  |k    r nt+          |          dk    st-          |d         t.                    rVt+          |d                   t+          |          k    s0J dt+          |           dt+          |d                                t-          |d         t.                    rH|	                    t/          fd|d                                         D                                  t-          |d         t4          j                  rE|	                    t/          d t9          ||d         gd
          D                                  l|	                    t/          fdt9          ||d         d
          D                                  |	|fS )as  Prepare inputs for weight only quantization.

    Args:
        model (ModelProto or ONNXModel): onnx model
        n_samples (int, optional): calibration sample number. -1 means all samples.
        dataloader (object): dataloader for calibration.
        providers (list): providers to use

    Returns:
        inputs: prepared inputs.
        so: session options
    r   )	find_specr   )to_numpy)      onnxruntime_extensions)get_library_path_augment.onnxTFsave_as_external_dataall_tensors_to_one_fileconvert_attributer   c                     g | ]	}|j         
S rd   r   r   s     rV   rj   z"prepare_inputs.<locals>.<listcomp>G  s    999qAF999rX   r   zInput number mismatch, require z	 but get c                 0    g | ]\  }}| |          fS rd   rd   )rf   r   inp_datar   s      rV   rj   z"prepare_inputs.<locals>.<listcomp>T  s,    aaa~tXxx'9'9 :aaarX   c                     g | ]	\  }}||f
S rd   rd   )rf   r   inps      rV   rj   z"prepare_inputs.<locals>.<listcomp>V  s     hhh	cshhhrX   strictc                 0    g | ]\  }}| |          fS rd   rd   )rf   r   r   r   s      rV   rj   z"prepare_inputs.<locals>.<listcomp>X  s*    ppp)$xx}} 5ppprX   )importlib.utilr   utilr   ortSessionOptionssysversion_infor   r   register_custom_ops_libraryis_large_modelr1   
save_modelr   r   InferenceSessionSerializeToString
get_inputs	enumerate
batch_sizer   
isinstancedictr5   itemsr(   ndarrayzip)r   	n_samples
dataloaderr   r   sor   sessioninputs_namesr%   rg   r|   r   s               @rV   prepare_inputsr  #  s
    )(((((				B
'!!ii0H&I&I!;;;;;;
&&'7'7'9'9::: 
K."&$(#	
 	
 	
 	
 #	_U[::<<bIVVVV!%"2_"DbT]^^^ 
 :9G$6$6$8$8999LFZ(( s s4??Q**? ?9LLE|!!ZQ%>%>!tAw<<3|#4#4444\#l2C2C\\cRVWXRYll\\ 544 d1gt$$ 	sMM$aaaaQUVWQXQ^Q^Q`Q`aaabbccccQ,, 	sMM$hhSPTUVPWyaf=g=g=ghhhiijjjjMM$pppps<Y]^_Y`inGoGoGopppqqrrrr2:rX      {Gz?FTc
                   	#$%& d|z  dz
  $d#d%d&#$%&	fd}
| j         } |
|           \  }}t          j        |          dk    }d|||f<   d| |ddf<   |rQt          j        t          j        |                    ddd	         }| |ddf         } ||ddf         dd|f         }t          j        |           }t          j        |           }|t          j        t          j        |                    z  }t          j        |d                   }|||fxx         |z  cc<   t          j                            t          j        	                    |                    j
        }|}t          d|d         |          D ]}t          ||z   |d                   }||z
  }t          j        | ||ddf                   }t          j        |          }t          j        |          }t          j        |          }|||||f         }t          |          D ]2}||ddf         }|||f         }|d	k    r/||z   |z  dk    r# |
| ||z   ||z   |z   ddf                   \  }}|t          j        t          j        |ddt          j        f         |z            |z   d$          |z
  z                                  } | ||ddf<   || z
  dz  |dz  z  ||ddf<   || z
  |z  }!||dddfxx         t          j        t          j        ||d|f         d
          t          j        |!d
                    z  cc<   |!||ddf<   4||||ddf<   |dz  |||ddf<   | |dddfxx         t          j        ||d||f         |          z  cc<   |r t          j        |          }"||"ddf         }t          j        || j                   }~ |S )a  Quant the weight with GPTQ method.

    Args:
        W (array): weight.
        H (array): Hessian matrix.
        num_bits (int, optional): num_bits. Default is 4.
        group_size (int, optional): how many elements share one scale/zp. Default is 32.
        scheme (str, optional): sym or asym. Defaults to "asym".
        blocksize (int, optional): blocksize to quantize weight.
        percdamp (float, optional): percent of the average Hessian diagonal to use for dampening.
        actorder (bool, optional): whether rearrange Hessian matrix considering the diag's value.
        mse (bool, optional): whether get scale and zero point with mse error.
        perchannel (bool, optional): whether quantize weight per-channel.

    Returns:
        Q: fake quantized weight
    r   r   d   g?g333333@c                    | j         }s(t          j        |                                 d          } t          j        | j         d                   }t          j        t          j        | d          |          }t          j        t          j        | d          |          }dk    rMt          j        t          j	        |          |          }|dk     }t          j
        |          r||          ||<   |dk    |dk    z  }d||<   d||<   ||z
  z  }dk    r#t          j        |j                   dz   z  dz  }nt          j        | |z            }r6t          j        | j         d         g          t          d          z  }t          t          z                      D ]}d|z  z
  }	|	|z  }
|	|z  }||
z
  z  }dk    rt          j        |
 |z            n|}t          j        t          j        | |z            |z   d          }|| z  }t          j        t          j	        |                    }t          j        |d          }||k     }t          j
        |          r!||         ||<   ||         ||<   ||         ||<   s2|d         }t          j        ||          }t          j        ||          }dgdgt)          |          dz
  z  z   }t          j        ||          }t          j        ||          }||fS )Nr   r   r   r_   r   r   inf)r*   r(   expand_dimsrt   r)   rw   rm   ro   rn   rp   anyrq   rv   re   r   r[   r{   powerr   repeatr   r.   )r   r   tmpxminxmaxrD   zerobestrg   pxmin1xmax1scale1zero1qerrr*   gridrh   	maxshrinkmsenorm
perchannelr}   s                    rV   find_paramszgptq.<locals>.find_params~  s   L	 	>^FNN$4$41===Fhv|A''z"&a000#66z"&a000#66U??:bfTllD11D(Cvc{{ '!#YJS	qyTQY'S	S	$U??75;''4!84q8DD8TEEM**D 	+7FLO,--e<D3y4/0011 + +DLDD%-4/5;u__%&111$GBHVf_55=q$GGVHRVAYY--fQllDj6#;; + #CDI!'E#J %c
DI 	(A,CIeS))E9T3''Dsc)nnq011
5%((z$&&d{rX   r   Nr   r   )r*   r(   diagargsort
zeros_liker   r8   linalgcholeskyinvr   r   rm   r   deepcopyr{   rv   newaxisrt   matmulr#  r.   )'WHr@   rA   r}   	blocksizepercdampactorderr5  r7  r8  r*   rD   r   deadpermLossesQdampr9  Hinvi1i2countW1Q1Err1Losses1Hinv1rg   wdr1  err1invpermr3  rh   r4  r6  s'       `   ``                         @@@@rV   gptqrY  \  sk   : h;?DDID. . . . . . . . . . .` GEAIE271::?DAdDjMAdAAAgJ   z"'!**%%ddd+dAAAgJdAAAgJqqq$w]1F
aAbgbgajj)))D9U1XDdDjMMMTMMM
	29==++,,.ADAuQx++ 7 7iq**R]1RUAAAX;'']2}R  -##RUBrE\"u 	 	A1aaa4AadARFj(A-- +ArAv"q&::M.NPQPQPQ.Q,R S SIE2"'"(1QQQ
]+;e+C"D"Dr"I1dSSVXXYbbddABq!!!tHUqL1a4/GAqqqDMEQ;Dqrr111uIII2>%A,Q#G#G#GX\cdIeIeIefffIIIDAAAJJ"R%("Q;r"uaaax	"##qqq&			RYtBCCBJ/666				 *T""gqqqjM

1agA	HrX   c                   *+, t          |           } | j        $t          j                            | j                  nd}t          | |||          \  }}~t          j        | j        j	        j
                  }|                     d |D                        g }|                                 D ]}}|j        dv rr|                    |j        i           dk    rS|                    |j        i                               dd          dk    r |                    |j        d                    ~t%          t'          |                    }|                     |           | j        r&t-          j        | j        | j        d	z   d
d
d           | j        s.t1          j        | j                                        ||          nt1          j        | j        d	z   ||          }t7          |          D ]x\  }}t9          t;          |          |dz              g }g }| j        |         D ]8}|j        dv r+|                    |j        i           dk    r|                    |j        i                               dd          dk    r|                     |j        d                   tA          j!        |                     | "                    |j                  j        d                   |                                          }t;          |j#                  dk    r|                    |           |                    | "                    |j                             :t;          |          dk    rd |D             }d+|D ]}|$                    |g|          d         **j#        d         ,tK          j&        *d*j#        d         f          *+,fd|D             }+,z  +tK          j'        d+z            *z  **fd|D             }tQ          |||d          D ]B\  }}}|j        |v r9||j                 d         }||j                 d         }||j                 d         }|dk    r|n|j#        d         }|j)        }tU          ||||||||	|
|
  
        }|                     |j        d                   }| +                    |j        d                   } |dk    }!|!r|j#        }"|"d         |z   dz
  |z  }#tY          |||#          }t[          |j.        |||d          \  }}$}%t_          ||"|||#|0                    d          |$0                    |          |dk    r|%nd|	  	        \  }&}'| 1                    |'           | 2                    |           | 3                    |&           nt,          j4        5                    |j        d         d|d|z   tm          |          |j#        |0                    |          7                                d
           }(| 8                    |(           |(j        |j        d<   | dk    r| 9                    |           Dz|                     |           | j        j	        j
        :                    |           | ;                                 | j        r?dd!l<m=})  |)| j        t          j        >                    | j                  d                    | S )"a  Quant the model with GPTQ method.

    Args:
        model (ModelProto or ONNXModel): onnx model
        dataloader (object): dataloader for calibration.
        weight_config (dict): quantization config
                For example,
                weight_config = {
                    'fc2':
                        {
                            'bits': 4,
                            'group_size': 32,
                            'scheme': 'sym',
                            'algorithm': 'GPTQ'
                        }
                }
        num_bits (int, optional): num_bits. Default is 4.
        group_size (int, optional): how many elements share one scale/zp. Default is 32.
        scheme (str, optional): sym or asym. Defaults to "asym".
        n_samples (int, optional): calibration sample number.
        percdamp (float, optional): percent of the average Hessian diagonal to use for dampening.
        blocksize (int, optional): blocksize to quantize weight.
        actorder (bool, optional): whether rearrange Hessian matrix considering the diag's value.
        mse (bool, optional): whether get scale and zero point with mse error.
        perchannel (bool, optional): whether quantize weight per-channel.
        accuracy_level (int): accuracy level. Support 0 (unset), 1(fp32), 2(fp16), 3(bf16), or 4(int8).
        providers (list): providers to use

    Returns:
        model: fake quantized ONNXModel
    Nr   c                     g | ]	}|j         
S rd   r   r   s     rV   rj   z!gptq_quantize.<locals>.<listcomp>  s    &B&B&B!qv&B&B&BrX   r   r   r   GPTQr   r   TFr   r   r   r   c                 f    g | ].}t          j        |j        d          |j        d          f          /S r   )r(   r)   r*   r   s     rV   rj   z!gptq_quantize.<locals>.<listcomp>O  s3    BBBQbh
AGAJ/00BBBrX   r   c                 &    g | ]}|z   z  z  S rd   rd   )rf   rg   nsamplesr'  s     rV   rj   z!gptq_quantize.<locals>.<listcomp>U  s'    @@@!x8c>23@@@rX   c                 J    g | ]}|t          j        j                  z    S rd   )r(   rA  r   )rf   rg   r   s     rV   rj   z!gptq_quantize.<locals>.<listcomp>X  s,    888!bis+++888rX   r  r"   rA   r}   )r@   rA   r}   rD  rE  rF  r5  r7  r   r^   r   rZ   r   r   r   r   )load_external_data_for_model)?r   r   r   r   r   r  r   r?  r   graphr<   remove_tensors_from_outputsr   rL   r   r   r5   r+   listsetadd_tensors_to_outputsr  r1   r  r  r  r  r  r   r   input_name_to_nodesr   r   r   get_noder*   runr(   r.   r   r  r   rY  r   r   r   r   rW   r6   r   remove_nodeadd_noder2   r3   r   r4   r   r   	MergeFromr   onnx.external_data_helperrb  split)-r   r  r   r@   rA   r}   r  rE  rD  rF  r5  r7  r$   r   r   r%   r  
org_outputoutput_namesr>   r  rP   
input_name	node_listr   r   Hsr|   rC  r   rC   r   r   r   r   rB   rD   r   r   rJ   rT   rb  r   r`  r'  s-                                             @@@rV   gptq_quantizeru    sG   ^ eE494D4Prwu/000VXHy*iHHJFBu{0788J	%%&B&Bz&B&B&BCCCL / /LJ&&!!$)R00F::!!$)R0044[&IIVSS
1...L))**L	  ... 
K."&$(#	
 	
 	
 	
 #	_U[::<<bIVVVV!%"2_"DbT]^^^  %\22 `8 `8ZC--sQw777	-j9 	< 	<D
**!%%di44>>!%%di4488fMMQWWW))$*Q-88D%.))%..*C*C*I!*LMMx $&&  v|$$))v&&&  	!:!:;;;w<<1BB'BBB 	9 	9D++zlD11!4C)A,C*S2sy}"566C@@@@@R@@@BOH'!h,''#-C8888R888BB GR666	=	8 =	8 
yM))(3F;*495lC
&ty1(;'1R'7'7V\!_JLE!%#!!%  H "11$*Q-@@M"<<TZ]KKN,4M), 5"L	%aL:59jH%h
HEE&28:xU[]c&d&d#%+G!*%)%%__W55,,u--%+v%5%5rr4#1
, 
, 
,(y &&y111!!$'''}----"&+"9"9A)Ih)I)I:)I)II6u==!!//7799 #: # # %%o666 / 4
1""((777{=	8~ 
%%l333	K&&z222	  VJJJJJJ$$U["'--@P2Q2QRS2TUUULrX   r^  )r   rY   rZ   r[   r\   )r   rY   )r   rY   rZ   r  r  FFT)__doc__r   loggingr   r  numpyr(   r1   r   onnx.helperr   onnxruntimer  
onnx_modelr   r  r   	getLoggerr,   rW   r   r   r   r   r   r   r   r  rY  ru  rd   rX   rV   <module>r}     s  0 % $   				 



            0 0 0 0 0 0     ! ! ! ! ! ! % % % % % %		.	/	/ p. p. p. p.f3' 3' 3' 3'lF' F' F' F'RXD XD XD XDv8 8 8 8&  4 %&t t t tn  6 6 6x I I I I^ %&| | | | | |rX   