
    PiA                     V   d dl Z d dlmZ d dlmZmZ d dlZd dlmZm	Z	 d dl
mZ d dlZd dlmZ d dlmZmZ d dlmZmZmZmZmZmZmZmZmZmZmZ dd	ej        ddd
ddddd
d
fdedee         dee          dee         dee         deee                  dee          dee          de!de!fdZ"e#dk    rd dl$Z$ e$j%        d          Z&e&'                    de ed          d           e&'                    ddedgd !           e&'                    d"e dd#           e&'                    d$d% ej        d&           e&'                    d'ed	d(           e&'                    d)d*ed+,           e&'                    d-ed.,           e&'                    d/d0d12           e&'                    d3e dd4           e&'                    d5eddgd67           e&'                    d8e d9d:           e&'                    d;e d<d=           e&'                    d>e!d
d?           e&'                    d@d0dA2           e&(                                Z) e"e)j*        e)j+        e)j,        e)j-        e)j.        e)j/        e)j0        e)j1        e)j2        e)j3        e)j4        e)j5        e)j6        e)j7                   dS dS )B    N)Path)ListOptional)_load_modeldevice_sync)get_tokenizer)prepare_inputs_for_model)!MXDynamicActivationMXWeightConfig'NVFP4DynamicActivationNVFP4WeightConfig))Float8DynamicActivationFloat8WeightConfigFloat8WeightOnlyConfigFPXWeightOnlyConfigInt4WeightOnlyConfig%Int8DynamicActivationInt8WeightConfigInt8WeightOnlyConfigPerBlockPerRow	PerTensorUIntXWeightOnlyConfig	quantize_cudaFcheckpoint_pathtaskslimitquantizationsparsitycalibration_taskscalibration_limitcalibration_seq_lengthpad_calibration_inputsprint_modelc                   45 t          d|  d| d| d| d| dd| d| d	| d
| d|	 dz   d| d| dz              t          j        j                                         |                                 s
J |             | j        dz  }|                                sJ t          |                      t          d           t          j                    }t          | d|          }||j
        j        }t          |           t          dt          j                    |z
  dd           t          ||           }|r>d|v rddlm}  ||           d|v rt!          |t#                                 d|v rt!          |t%                                 d|v rt!          |t'          dd                     d|v ryd |vrud!|v rd"}nd#}t)          |                    d$          d%                   }|d&v sJ d'|             t!          |                    |          t/          ||d%(                     d)|v rd!|v rd"}nd#}|                    d$          }t)          |d%                   }t0          j        t0          j        t0          j        t0          j        t0          j        t0          j        t0          j        t0          j         d*}||         }t)          |d                   }t!          |tC          |||+                     d,|v r-dd-l"m#} t!          |t/           |            d%.                     d|v rd |v rdd/l$m%} dd0l&m'} t)          |                    d$          d1                   }|d&v sJ d'|             |t0          j(        k    sJ | d2|             d3|v s
J d4             |||tR          |j
        j*        |d          +                    |	|
          ,                                }t          d5            |||6          }|-                    d%|7            |j.        |g|R   |                    |          }d8|v rt!          |t_                                 d9|v rt          |                    d$          d:                   }|d;k    rta                      }n<|d<k    rtc                      }n'|d9k    rta                      }nte          d=|           t!          |tg          |>          d? @           |dAk    rAtg          ti          d%dBg          ti          dBdBg          fdCD          } t!          ||            |dEk    r8tk          t0          j6        t0          j6        F          } t!          || dG @           |dHk    r$to          d"d"I          } t!          || dJ @           dK|v riddLl8m9}! ddMl:m;5 ddNl<m=}" |!>                    | j                  }#|                    d$          }g dO}$t          |          d%k    r|d%         n|}%|dd         }dP |D             |$t          |          d         z   \  }&}'}}(})}*}+},|                    |%          }t          dQ|' dR| dS|& dT|( dU|) dV|* dW|+ dX|, dY           t1          j@        |%          5  |-                    |(|)d"Z           ddd           n# 1 swxY w Y   |&r5fd[}-n5fd\}- |"||#|-d]|)|(|'|*|+|,d%k    ^
  
         |                    |           |A                                 d_|v rBdd`lBmC}. |                    |           t!          | |.t0          j        dab                     n|D                    dc          rdddl$mE}/ ddelFmG4mH}0mI}1 |                    d$          d%         }2t)          |                    d$          d                   }t          t0          |2t0          j                   }2|                    |          } |1|d%df|2|g            |/|                    |          |dftR          |h          K                    digd%j           4fdk}3d!|v }t!          | |0|2||l          |3           |r0|dmv rt1          jL        |          }nt1          jL        |dnd"o          }|rt          |           t1          jM                    5  t          dp           dddl$mE}/  |/|                    |          ||tR          |h          K                    ||j           ddd           dS # 1 swxY w Y   dS )qz-Runs the evaluation of a model using LM Eval.z
Evaluating model z on tasks: z	, limit: z
, device: z, precision: z, zquantization: z, sparsity: z, compile: z, max_length: z, calibration_tasks: zcalibration_seq_length: z, pad_calibration_inputs: 
ztokenizer.modelzLoading model ...cpuN)devicezTime to load model: z.02fz seconds	spinquantr   )apply_spinquantint8woint8dqfp6      int4wogptqhqqTF-   )    @         z<int4wo groupsize needs to be one of [32,64,128,256] but got )
group_sizeuse_hqqversionuintx)r1   r,   r+                  )r7   marlin)MarlinSparseLayout)layoutr8   )LMEvalInputRecorder)Int4WeightOnlyGPTQQuantizerz( requires precision or bfloat16 but got r   z)int4 gptq quantization only works on cudaz%Obtained inputs, starting calibration)r6   r%   )max_batch_sizemax_seq_lengthfloat8wofloat8dqtensorrowzUnknown granularity )granularityc                 L    t          | t          j        j                  o|dk    S Noutput
isinstancetorchnnLinearmodfqns     n/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchao/_models/llama/eval.py<lambda>z run_evaluation.<locals>.<lambda>   #    :c58?+K+K ,$8O     )	filter_fnfloat8_a1x128_w128x128r4   g-q=)rL   activation_value_lbmxfp8)activation_dtypeweight_dtypec                 L    t          | t          j        j                  o|dk    S rN   rP   rU   s     rX   rY   z run_evaluation.<locals>.<lambda>   rZ   r[   nvfp4)use_dynamic_per_tensor_scaleuse_triton_kernelc                 L    t          | t          j        j                  o|dk    S rN   rP   rU   s     rX   rY   z run_evaluation.<locals>.<lambda>   rZ   r[   	autoround)AutoTokenizer)TransformerBlock)quantize_model_with_autoround_)F   r4   r>   i   r4   r1   r   c                 ,    g | ]}t          |          S  )int).0xs     rX   
<listcomp>z"run_evaluation.<locals>.<listcomp>   s    ---AQ---r[   z&Quantizing model with autoround(iters=z, groupsize=z, quant_lm_head=z, batch_size=z	, seqlen=z, nsamples=z, gradient_accumulate_steps=z, compile_optimization_process=))rE   rF   trainingc                 ,    t          |           pd|v S rN   rQ   rV   rW   ri   s     rX   rY   z run_evaluation.<locals>.<lambda>   s!    Z5E%F%F &'3 r[   c                 $    t          |           S Nru   rv   s     rX   rY   z run_evaluation.<locals>.<lambda>  s    JsDT4U4U r[   r:   )
model	tokenizeris_target_modulebitsseqlen
batch_sizeitersnsamplesgradient_accumulate_stepscompile_optimization_processcodebook)codebook_weight_onlyr3   )dtypescale_block_sizez	awq-uintx)TransformerEvalWrapper)AWQObservedLinear	awq_uintxinsert_awq_observer_r5   )quant_dtyper6   )ry   rz   rF   input_prep_funcr%   wikitext)r   r   c                 $    t          |           S rx   ru   )mrW   r   s     rX   rY   z run_evaluation.<locals>.<lambda>3  s    
1>O0P0P r[   )r   r6   r7   )r]   r_   rc   zmax-autotune)mode	fullgraphzRunning evaluation ...)Nprinttorchaor   utils"recommended_inductor_config_setteris_fileparentstrtimer   config
block_sizer   r   torchao.prototype.spinquantr'   r   r   r   r   rn   splittor   rR   uint1uint2uint3uint4uint5uint6uint7uint8r   torchao.dtypesr@   torchao._models._evalrB   torchao.quantization.GPTQrC   bfloat16r	   
vocab_sizerecord_inputsget_recorded_inputssetup_cachesquantizer   r   r   
ValueErrorr   r   r
   float8_e4m3fnr   transformersrh   torchao._models.llama.modelri   )torchao.prototype.autoround.autoround_llmrj   from_pretrainedlenr%   reset_caches'torchao.prototype.quantization.codebookr   
startswithr   torchao.prototype.awqr   r   r   getattrrun_evalcompileno_grad)6r   r   r   r%   	precisionr   r   r   
max_lengthr   r   r   r    r!   tokenizer_patht0ry   rz   r'   r7   	groupsize_quant_argsnbits_NBITS_TO_DTYPEr   r6   r@   rB   rC   inputs	quantizerrL   r   rh   rj   
_tokenizer_default_quant_args_model_deviequant_lm_headr   r~   r}   r   grad_acc_stepsr   r{   r   r   r   r   r   is_observed_linearr   ri   s6                                                       @@rX   run_evaluationr   &   sJ   " 
}o}}%}}%}}[a}}py}}} X<  X  XX  X  X'  X  Xak  X  X  CT  X  X  X	X
q%;
q
qWm
q
q
q	r  
 AACCC""$$55o55$$+.??N!!##88S%8%888#	
	B	::E\,
v	
@r!1
@
@
@
@AAAno>>I o,&&CCCCCCOE"""|##e133444|##eBDDEEEL  e0A66777|##Fl,B,B$$L..s33A677I 2222ZyZZ 322   $	7TUVVV   l"" $$&,,S11KA''E;;;;;;;;	 	O $E*E[^,,Je25*gVVVWWW|##999999+3E3E3G3GQRSSS   |##,(>(>AAAAAAMMMMMML..s33B788I 2222ZyZZ 322 ...TTTT /.. V###%P#####*,L+*    %%  %$&&  9:::33yQWXXXIa@VWWWIu.v....HHV$$E%%e355666%%l0055b9::Kh&&'kk%%$hh*,,"+++KK$%IK%I%IJJJ9kRRR$ $    333>%q#h//3*1E1EF$)  F eV$$$7""6!&!4"0  F $ $    7""<-1"&  F $ $    ,&&222222DDDDDD      '667MNNJ ',,S11K"G"G"G-0-=-=-A-A;q>>vL%abb/K .----0CCDTDTDVDV0WW	,HH\**ETU T TPY T T%2T TAKT TV\T TiqT T1?T T 5QT T T   l++  ""#-ft #                 
  V' ' ' ' ! 
 $V#U#U#U **$!1%!*8-IQ-N    HHV   %%TTTTTTHHV++%+PRSSS    $$[11 "	DDDDDD          ',,S11!4K\//44Q788J!%ekBBKHHV$$E  q#;:    #"hhv&&#" 8   h!l     "Q!P!P!P|+G	 +
G   #    NGGGM%((EEM%nMMME e	 
 
&'''@@@@@@((6""%4	
 	
 	
 (  
 
 

 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
s%   )[[[Ad55d9<d9__main__zRun HF Model Evaluation)descriptionz--checkpoint_pathz<../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pthzModel checkpoint path.)typedefaulthelpz--tasks+r   z?List of lm-eluther tasks to evaluate usage: --tasks task1 task2)nargsr   r   r   z--limitz"Number of eval samples to evaluatez--precisionc                 ^    t          t          |                     d          d                   S )N.rI   )r   rR   r   )rp   s    rX   rY   rY   o  s    wuaggcll2&677 r[   zdtype precision to usez--devicezDevice to use for evaluationz-qz--quantizationa  Which quantization techniques to apply: int8dq, int8wo, fp6, int4wo-<groupsize>, int4wo-<groupsize>-gptq, autoquant, autoquant-int4, int4wo-<groupsize>-hqq, uintx-<nbits>-<groupsize>, uintx-<nbits>-<groupsize>-hqq, sparse-marlin, spinquant, autoround-<model_device>-<quant_lm_head>-<iters>-<groupsize>-<batch_size>-<seqlen>-<nsamples>-<grad_acc_steps>-<c>, float8wo, float8dq, float8saq)r   r   z
--sparsityz3Which sparsity techniques to apply: semi-structuredz	--compile
store_truezWhether to compile the model.)actionr   z--max_lengthz%Length of text to process at one timez--calibration_tasksz.tasks to do gptq calibration on, if doing gptq)r   r   r   r   z--calibration_limiti  z-number of samples to use for gptq calibrationz--calibration_seq_lengthd   z/length of sequences to use for gptq calibrationz--pad_calibration_inputsz{pads sequences shorter than calibration_seq_length to that length, yielding more calibration inputs but running much slowerz--print_modelzWhether to print the model.)8r   pathlibr   typingr   r   rR   generater   r   rz   r   r   r   r	   /torchao.prototype.mx_formats.inference_workflowr
   r   torchao.quantizationr   r   r   r   r   r   r   r   r   r   r   r   r   rn   boolr   __name__argparseArgumentParserparseradd_argument
parse_argsargsr   r   r   r%   r   r   r   r   r   r   r   r   r    r!   rm   r[   rX   <module>r      s1          ! ! ! ! ! ! ! !         $ # # # # #  @ @ @ @ @ @                                $  n"&"-1'+,0#(p
 p
p
9p
 C=p
 3-p
 smp
  S	*p
  }p
 %SMp
 !p
 p
 p
 p
 p
f	 zOOO$X$1JKKKF
STT%	     N     T0T     77%	     f3Q     ,     C    
 L/N     4	     =     <	     ">	     " K	     3P     DN

##    k r[   